diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt
index 7ebf07122f1..a9e4a007290 100644
--- a/cpp/tensorrt_llm/CMakeLists.txt
+++ b/cpp/tensorrt_llm/CMakeLists.txt
@@ -189,6 +189,7 @@ set(TRTLLM_LINK_LIBS
   fb_gemm_src
   gemm_swiglu_sm90_src
   cutlass_src
+  cute_dsl_src
   layers_src
   runtime_src
   testing_src
diff --git a/cpp/tensorrt_llm/kernels/CMakeLists.txt b/cpp/tensorrt_llm/kernels/CMakeLists.txt
index 74680318170..7cf669de18b 100644
--- a/cpp/tensorrt_llm/kernels/CMakeLists.txt
+++ b/cpp/tensorrt_llm/kernels/CMakeLists.txt
@@ -22,6 +22,8 @@ file(GLOB_RECURSE SRC_CU *.cu)
 # selectiveScan trtllmGenKernels folder
 list(FILTER SRC_CPP EXCLUDE REGEX "cutlass_kernels/.*")
 list(FILTER SRC_CU EXCLUDE REGEX "cutlass_kernels/.*")
+list(FILTER SRC_CPP EXCLUDE REGEX "cuteDslKernels/.*")
+list(FILTER SRC_CU EXCLUDE REGEX "cuteDslKernels/.*")
 list(FILTER SRC_CPP EXCLUDE REGEX "flashMLA/.*")
 list(FILTER SRC_CU EXCLUDE REGEX "flashMLA/.*")
 list(FILTER SRC_CPP EXCLUDE REGEX "contextFusedMultiHeadAttention/.*")
@@ -75,6 +77,7 @@ target_include_directories(
 add_cuda_architectures(kernels_src 89)
 
 add_subdirectory(cutlass_kernels)
+add_subdirectory(cuteDslKernels)
 add_subdirectory(flashMLA)
 add_subdirectory(contextFusedMultiHeadAttention)
 add_subdirectory(decoderMaskedMultiheadAttention)
diff --git a/cpp/tensorrt_llm/kernels/cuteDslKernels/CMakeLists.txt b/cpp/tensorrt_llm/kernels/cuteDslKernels/CMakeLists.txt
new file mode 100644
index 00000000000..a718c76c076
--- /dev/null
+++ b/cpp/tensorrt_llm/kernels/cuteDslKernels/CMakeLists.txt
@@ -0,0 +1,23 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved. SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+#
+
+file(GLOB_RECURSE SRC_CPP *.cpp)
+file(GLOB_RECURSE SRC_CU *.cu)
+
+add_library(cute_dsl_src OBJECT ${SRC_CPP} ${SRC_CU})
+set_property(TARGET cute_dsl_src PROPERTY POSITION_INDEPENDENT_CODE ON)
+set_property(TARGET cute_dsl_src PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
diff --git a/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu
new file mode 100644
index 00000000000..32a54662ff1
--- /dev/null
+++ b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/cudaUtils.h"
+#include "tensorrt_llm/common/envUtils.h"
+#include "tensorrt_llm/kernels/cuteDslKernels/moeUtils.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh"
+#include "tensorrt_llm/kernels/quantization.cuh"
+#include "tensorrt_llm/kernels/quantization.h"
+
+#include <cstdint>
+#include <type_traits>
+
+namespace tensorrt_llm::kernels::cute_dsl
+{
+namespace
+{
+using ElemCopyType = uint4;
+using SFCopyType = uint32_t;
+
+template <typename T>
+auto constexpr bitsPerElem()
+{
+#ifdef ENABLE_FP4
+    return std::is_same_v<T, __nv_fp4_e2m1> ? 4 : cute::sizeof_bits_v<T>;
+#else
+    return cute::sizeof_bits_v<T>;
+#endif
+}
+
+template <typename InputType>
+auto constexpr elemPerCopy()
+{
+    return bitsPerElem<ElemCopyType>() / bitsPerElem<InputType>();
+}
+
+template <typename SFType>
+auto constexpr sfElemPerCopy()
+{
+    return bitsPerElem<SFCopyType>() / bitsPerElem<SFType>();
+}
+} // namespace
+
+template <typename InputType, typename SFType, int32_t kSFVecSize, int32_t kThreadsPerBlock>
+__global__ void moePermuteKernel(InputType const* input, InputType* permuted_output, SFType const* input_sf,
+    SFType* permuted_sf, int32_t const* tile_idx_to_mn_limit, int32_t const* permuted_idx_to_expanded_idx,
+    int32_t const* num_non_exiting_tiles, int32_t const hidden_size, int32_t const top_k, int32_t const tile_size)
+{
+    int32_t constexpr kElemPerCopy = elemPerCopy<InputType>();
+    int32_t constexpr kSFElemPerCopy = sfElemPerCopy<SFType>();
+    // Need int64_t to prevent overflow when computing pointer offsets.
+    int64_t const kCopyPerToken = hidden_size / kElemPerCopy;
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.wait;");
+#endif
+
+    int32_t const num_tokens = num_non_exiting_tiles[0] * tile_size;
+    for (int32_t permuted_idx = blockIdx.x; permuted_idx < num_tokens; permuted_idx += gridDim.x)
+    {
+        int32_t const tile_idx = permuted_idx / tile_size;
+        if (permuted_idx >= tile_idx_to_mn_limit[tile_idx])
+        {
+            continue;
+        }
+        int32_t const expanded_idx = permuted_idx_to_expanded_idx[permuted_idx];
+        int32_t const token_idx = expanded_idx / top_k;
+
+        auto const* src_ptr = reinterpret_cast<ElemCopyType const*>(input) + token_idx * kCopyPerToken;
+        auto* dst_ptr = reinterpret_cast<ElemCopyType*>(permuted_output) + permuted_idx * kCopyPerToken;
+        for (int32_t i = threadIdx.x; i < kCopyPerToken; i += kThreadsPerBlock)
+        {
+            dst_ptr[i] = src_ptr[i];
+        }
+
+#ifdef ENABLE_FP4
+        if constexpr (std::is_same_v<InputType, __nv_fp4_e2m1>)
+        {
+            int32_t const sf_hidden_size = hidden_size / kSFVecSize;
+            int64_t const kSFCopyPerToken = sf_hidden_size / kSFElemPerCopy;
+            auto const* sf_src_ptr = reinterpret_cast<SFCopyType const*>(input_sf);
+            auto* sf_dst_ptr = reinterpret_cast<SFCopyType*>(permuted_sf);
+            for (int32_t i = threadIdx.x; i < kSFCopyPerToken; i += kThreadsPerBlock)
+            {
+                // input_sf is not swizzled, while permuted_sf is swizzled.
+                int64_t const src_offset = token_idx * kSFCopyPerToken + i;
+                int64_t const dst_offset = get_sf_out_offset_128x4(/* batchIdx= */ std::nullopt, permuted_idx,
+                                               i * kSFElemPerCopy, /* numRows= */ std::nullopt, sf_hidden_size)
+                    / kSFElemPerCopy;
+
+                sf_dst_ptr[dst_offset] = sf_src_ptr[src_offset];
+            }
+        }
+#endif
+    }
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+template <typename InputType, typename SFType>
+void moePermute(InputType const* input, InputType* permuted_output, SFType const* input_sf, SFType* permuted_sf,
+    int32_t const* tile_idx_to_mn_limit, int32_t const* permuted_idx_to_expanded_idx,
+    int32_t const* num_non_exiting_tiles, int32_t const max_num_permuted_tokens, int32_t const hidden_size,
+    int32_t const top_k, int32_t const tile_size, cudaStream_t stream)
+{
+    int32_t constexpr kThreadsPerBlock = 256;
+    int32_t constexpr kSFVecSize = 16;
+    int32_t constexpr kElemPerCopy = elemPerCopy<InputType>();
+    TLLM_CHECK_WITH_INFO(hidden_size % kElemPerCopy == 0, "hidden_size must be divisible by %d.", kElemPerCopy);
+
+#ifdef ENABLE_FP4
+    if constexpr (std::is_same_v<InputType, __nv_fp4_e2m1>)
+    {
+        int32_t constexpr kSFMAlignment = 128;
+        int32_t constexpr kSFKAlignment = 4;
+        int32_t constexpr kSFElemPerCopy = sfElemPerCopy<SFType>();
+        static_assert(kSFElemPerCopy == kSFKAlignment);
+        TLLM_CHECK_WITH_INFO(max_num_permuted_tokens % kSFMAlignment == 0,
+            "max_num_permuted_tokens must be divisible by %d.", kSFMAlignment);
+        TLLM_CHECK_WITH_INFO(hidden_size % (kSFVecSize * kSFKAlignment) == 0, "hidden_size must be divisible by %d.",
+            kSFVecSize * kSFKAlignment);
+        TLLM_CHECK_WITH_INFO(input_sf != nullptr, "input_sf is required for NVFP4.");
+        TLLM_CHECK_WITH_INFO(permuted_sf != nullptr, "permuted_sf is required for NVFP4.");
+    }
+#endif
+
+    static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
+    int32_t const blocks = std::min(smCount, max_num_permuted_tokens);
+    int32_t const threads = kThreadsPerBlock;
+
+    auto kernel = &moePermuteKernel<InputType, SFType, kSFVecSize, kThreadsPerBlock>;
+
+    cudaLaunchConfig_t config;
+    config.gridDim = blocks;
+    config.blockDim = threads;
+    config.dynamicSmemBytes = 0;
+    config.stream = stream;
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
+    config.numAttrs = 1;
+    config.attrs = attrs;
+    cudaLaunchKernelEx(&config, kernel, input, permuted_output, input_sf, permuted_sf, tile_idx_to_mn_limit,
+        permuted_idx_to_expanded_idx, num_non_exiting_tiles, hidden_size, top_k, tile_size);
+}
+
+#define INSTANTIATE_MOE_PERMUTE(InputType, SFType)                                                                    \
+    template void moePermute(InputType const* input, InputType* permuted_output,                                      \
+        SFType const* input_sf, SFType* permuted_sf, int32_t const* tile_idx_to_mn_limit,                             \
+        int32_t const* permuted_idx_to_expanded_idx, int32_t const* num_non_exiting_tiles,                            \
+        int32_t const max_num_permuted_tokens, int32_t const hidden_size, int32_t const top_k,                        \
+        int32_t const tile_size, cudaStream_t stream)
+
+INSTANTIATE_MOE_PERMUTE(half, uint8_t);
+#ifdef ENABLE_BF16
+INSTANTIATE_MOE_PERMUTE(__nv_bfloat16, uint8_t);
+#endif
+#ifdef ENABLE_FP8
+INSTANTIATE_MOE_PERMUTE(__nv_fp8_e4m3, uint8_t);
+#endif
+#ifdef ENABLE_FP4
+INSTANTIATE_MOE_PERMUTE(__nv_fp4_e2m1, uint8_t);
+#endif
+#undef INSTANTIATE_MOE_PERMUTE
+
+template <typename InputType, typename TopKScaleType, int32_t kThreadsPerBlock>
+__global__ void moeUnpermuteKernel(InputType const* permuted_input, InputType* output,
+    int32_t const* expanded_idx_to_permuted_idx, TopKScaleType const* topk_scales, int32_t const hidden_size,
+    int32_t const top_k)
+{
+    using AccumType = float;
+    int32_t constexpr kElemPerCopy = elemPerCopy<InputType>();
+    // Need int64_t to prevent overflow when computing pointer offsets.
+    int64_t const kCopyPerToken = hidden_size / kElemPerCopy;
+    InputType rmem[kElemPerCopy];
+    AccumType rmemAccum[kElemPerCopy];
+
+    int32_t const token_idx = blockIdx.x;
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.wait;");
+#endif
+
+    auto* dst_ptr = reinterpret_cast<ElemCopyType*>(output) + token_idx * kCopyPerToken;
+    for (int32_t i = threadIdx.x; i < kCopyPerToken; i += kThreadsPerBlock)
+    {
+#pragma unroll
+        for (int32_t j = 0; j < kElemPerCopy; j++)
+        {
+            rmemAccum[j] = 0;
+        }
+        for (int32_t k = 0; k < top_k; k++)
+        {
+            int32_t const permuted_idx = expanded_idx_to_permuted_idx[token_idx * top_k + k];
+            if (permuted_idx < 0)
+            {
+                continue;
+            }
+            auto const* src_ptr = reinterpret_cast<ElemCopyType const*>(permuted_input) + permuted_idx * kCopyPerToken;
+            *reinterpret_cast<ElemCopyType*>(rmem) = src_ptr[i];
+            TopKScaleType const scale = topk_scales[token_idx * top_k + k];
+
+#pragma unroll
+            for (int32_t j = 0; j < kElemPerCopy; j++)
+            {
+                rmemAccum[j] += static_cast<AccumType>(rmem[j]) * static_cast<AccumType>(scale);
+            }
+        }
+#pragma unroll
+        for (int32_t j = 0; j < kElemPerCopy; j++)
+        {
+            rmem[j] = static_cast<InputType>(rmemAccum[j]);
+        }
+        dst_ptr[i] = *reinterpret_cast<ElemCopyType*>(rmem);
+    }
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+template <typename InputType, typename TopKScaleType>
+void moeUnpermute(InputType const* permuted_input, InputType* output, int32_t const* expanded_idx_to_permuted_idx,
+    TopKScaleType const* topk_scales, int32_t const num_tokens, int32_t const hidden_size, int32_t const top_k,
+    cudaStream_t stream)
+{
+    int32_t constexpr kThreadsPerBlock = 256;
+    int32_t constexpr kElemPerCopy = elemPerCopy<InputType>();
+    TLLM_CHECK_WITH_INFO(hidden_size % kElemPerCopy == 0, "hidden_size must be divisible by %d.", kElemPerCopy);
+
+    int32_t const blocks = num_tokens;
+    int32_t const threads = kThreadsPerBlock;
+
+    auto kernel = &moeUnpermuteKernel<InputType, TopKScaleType, kThreadsPerBlock>;
+
+    cudaLaunchConfig_t config;
+    config.gridDim = blocks;
+    config.blockDim = threads;
+    config.dynamicSmemBytes = 0;
+    config.stream = stream;
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
+    config.numAttrs = 1;
+    config.attrs = attrs;
+    cudaLaunchKernelEx(
+        &config, kernel, permuted_input, output, expanded_idx_to_permuted_idx, topk_scales, hidden_size, top_k);
+}
+
+#define INSTANTIATE_MOE_UNPERMUTE(InputType, TopKScaleType)                                                           \
+    template void moeUnpermute(InputType const* permuted_input, InputType* output,                                    \
+        int32_t const* expanded_idx_to_permuted_idx, TopKScaleType const* topk_scales, int32_t const num_tokens,      \
+        int32_t const hidden_size, int32_t const top_k, cudaStream_t stream)
+
+INSTANTIATE_MOE_UNPERMUTE(half, float);
+INSTANTIATE_MOE_UNPERMUTE(half, half);
+#ifdef ENABLE_BF16
+INSTANTIATE_MOE_UNPERMUTE(__nv_bfloat16, float);
+INSTANTIATE_MOE_UNPERMUTE(__nv_bfloat16, __nv_bfloat16);
+#endif
+#undef INSTANTIATE_MOE_UNPERMUTE
+
+template <typename InputType, typename OutputType, typename SFType, int32_t kSFVecSize, typename ActFn,
+    int32_t kThreadsPerBlock>
+__global__ void moeActivationKernel(InputType const* input, OutputType* output, float const* global_sf,
+    SFType* output_sf, int32_t const* tile_idx_to_mn_limit, int32_t const* num_non_exiting_tiles,
+    int32_t const interm_size, int32_t const tile_size)
+{
+    using ComputeType = float;
+#ifdef ENABLE_FP4
+    using ElemOutputCopyType = std::conditional_t<std::is_same_v<OutputType, __nv_fp4_e2m1>, uint32_t, ElemCopyType>;
+#else
+    using ElemOutputCopyType = ElemCopyType;
+#endif
+    int32_t constexpr kElemPerCopy = elemPerCopy<InputType>();
+    // Need int64_t to prevent overflow when computing pointer offsets.
+    int64_t const kCopyPerToken = interm_size / kElemPerCopy;
+    InputType rmem[kElemPerCopy];
+    InputType rmemGate[kElemPerCopy];
+    ActFn act{};
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.wait;");
+#endif
+
+    float global_sf_val = global_sf == nullptr ? 1.0f : global_sf[0];
+
+    int32_t const num_tokens = num_non_exiting_tiles[0] * tile_size;
+    for (int32_t permuted_idx = blockIdx.x; permuted_idx < num_tokens; permuted_idx += gridDim.x)
+    {
+        int32_t const tile_idx = permuted_idx / tile_size;
+        if (permuted_idx >= tile_idx_to_mn_limit[tile_idx])
+        {
+            continue;
+        }
+        auto const* src_ptr
+            = reinterpret_cast<ElemCopyType const*>(input) + permuted_idx * kCopyPerToken * (ActFn::IS_GLU ? 2 : 1);
+        auto* dst_ptr = reinterpret_cast<ElemOutputCopyType*>(output) + permuted_idx * kCopyPerToken;
+        for (int32_t i = threadIdx.x; i < kCopyPerToken; i += kThreadsPerBlock)
+        {
+            *reinterpret_cast<ElemCopyType*>(rmem) = src_ptr[i];
+            if constexpr (ActFn::IS_GLU)
+            {
+                *reinterpret_cast<ElemCopyType*>(rmemGate) = src_ptr[i + kCopyPerToken];
+#pragma unroll
+                for (int32_t j = 0; j < kElemPerCopy; j++)
+                {
+                    rmem[j] = static_cast<InputType>(
+                        act(static_cast<ComputeType>(rmemGate[j]), static_cast<ComputeType>(rmem[j])));
+                }
+            }
+            else
+            {
+#pragma unroll
+                for (int32_t j = 0; j < kElemPerCopy; j++)
+                {
+                    rmem[j] = static_cast<InputType>(act(static_cast<ComputeType>(rmem[j])));
+                }
+            }
+
+#ifdef ENABLE_FP4
+            if constexpr (std::is_same_v<OutputType, __nv_fp4_e2m1>)
+            {
+                auto* sf_dst_ptr = cvt_quant_get_sf_out_offset<SFType, kSFVecSize / kElemPerCopy>(
+                    /* batchIdx= */ std::nullopt, permuted_idx, i, /*numRows=*/std::nullopt,
+                    interm_size / kSFVecSize, output_sf, QuantizationSFLayout::SWIZZLED);
+                dst_ptr[i] = cvt_warp_fp16_to_fp4<InputType, kSFVecSize, false>(
+                    *reinterpret_cast<PackedVec<InputType>*>(rmem), global_sf_val, sf_dst_ptr);
+            }
+            else
+#endif
+            {
+                dst_ptr[i] = *reinterpret_cast<ElemOutputCopyType*>(rmem);
+            }
+        }
+    }
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+template <typename InputType, typename OutputType, typename SFType>
+void moeActivation(InputType const* input, OutputType* output, float const* global_sf, SFType* output_sf,
+    int32_t const* tile_idx_to_mn_limit, int32_t const* num_non_exiting_tiles,
+    cutlass_kernels::ActivationParams activation_params, int32_t const max_num_permuted_tokens,
+    int32_t const interm_size, int32_t const tile_size, cudaStream_t stream)
+{
+    int32_t constexpr kThreadsPerBlock = 256;
+    int32_t constexpr kSFVecSize = 16;
+    int32_t constexpr kElemPerCopy = elemPerCopy<InputType>();
+    TLLM_CHECK_WITH_INFO(interm_size % kElemPerCopy == 0, "interm_size must be divisible by %d.", kElemPerCopy);
+
+#ifdef ENABLE_FP4
+    if constexpr (std::is_same_v<OutputType, __nv_fp4_e2m1>)
+    {
+        int32_t constexpr kSFMAlignment = 128;
+        int32_t constexpr kSFKAlignment = 4;
+        TLLM_CHECK_WITH_INFO(max_num_permuted_tokens % kSFMAlignment == 0,
+            "max_num_permuted_tokens must be divisible by %d.", kSFMAlignment);
+        TLLM_CHECK_WITH_INFO(interm_size % (kSFVecSize * kSFKAlignment) == 0, "interm_size must be divisible by %d.",
+            kSFVecSize * kSFKAlignment);
+        TLLM_CHECK_WITH_INFO(global_sf != nullptr, "global_sf is required for NVFP4.");
+        TLLM_CHECK_WITH_INFO(output_sf != nullptr, "output_sf is required for NVFP4.");
+    }
+#endif
+
+    static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount();
+    int32_t const blocks = std::min(smCount, max_num_permuted_tokens);
+    int32_t const threads = kThreadsPerBlock;
+
+    // Entry order must match cutlass_kernels::ActivationType:
+    // Gelu, Relu, Silu, Swiglu, Geglu, SwigluBias, Identity.
+    auto kernel_array
+        = std::array{&moeActivationKernel<InputType, OutputType, SFType, kSFVecSize,
+                         cutlass_kernels::IdentityAdaptor<cutlass::epilogue::thread::GELU>, kThreadsPerBlock>,
+            &moeActivationKernel<InputType, OutputType, SFType, kSFVecSize,
+                cutlass_kernels::IdentityAdaptor<cutlass::epilogue::thread::ReLu>, kThreadsPerBlock>,
+            &moeActivationKernel<InputType, OutputType, SFType, kSFVecSize,
+                cutlass_kernels::IdentityAdaptor<cutlass::epilogue::thread::SiLu>, kThreadsPerBlock>,
+            &moeActivationKernel<InputType, OutputType, SFType, kSFVecSize,
+                cutlass_kernels::GLUAdaptor<cutlass::epilogue::thread::SiLu>, kThreadsPerBlock>,
+            &moeActivationKernel<InputType, OutputType, SFType, kSFVecSize,
+                cutlass_kernels::GLUAdaptor<cutlass::epilogue::thread::GELU>, kThreadsPerBlock>,
+            &moeActivationKernel<InputType, OutputType, SFType, kSFVecSize, cutlass_kernels::SwigluBiasAdaptor,
+                kThreadsPerBlock>,
+            &moeActivationKernel<InputType, OutputType, SFType, kSFVecSize,
+                cutlass_kernels::IdentityAdaptor<cutlass::epilogue::thread::Identity>, kThreadsPerBlock>};
+
+    auto kernel = kernel_array[static_cast<int32_t>(activation_params.activation_type)];
+
+    cudaLaunchConfig_t config;
+    config.gridDim = blocks;
+    config.blockDim = threads;
+    config.dynamicSmemBytes = 0;
+    config.stream = stream;
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL();
+    config.numAttrs = 1;
+    config.attrs = attrs;
+    cudaLaunchKernelEx(&config, kernel, input, output, global_sf, output_sf, tile_idx_to_mn_limit,
+        num_non_exiting_tiles, interm_size, tile_size);
+}
+
+#define INSTANTIATE_MOE_ACTIVATION(InputType, OutputType, SFType)                                                     \
+    template void moeActivation(InputType const* input, OutputType* output,                                           \
+        float const* global_sf, SFType* output_sf, int32_t const* tile_idx_to_mn_limit,                               \
+        int32_t const* num_non_exiting_tiles, cutlass_kernels::ActivationParams activation_params,                    \
+        int32_t const max_num_permuted_tokens, int32_t const interm_size, int32_t const tile_size,                    \
+        cudaStream_t stream)
+
+INSTANTIATE_MOE_ACTIVATION(half, half, uint8_t);
+#ifdef ENABLE_BF16
+INSTANTIATE_MOE_ACTIVATION(__nv_bfloat16, __nv_bfloat16, uint8_t);
+#endif
+#ifdef ENABLE_FP4
+INSTANTIATE_MOE_ACTIVATION(half, __nv_fp4_e2m1, uint8_t);
+#ifdef ENABLE_BF16
+INSTANTIATE_MOE_ACTIVATION(__nv_bfloat16, __nv_fp4_e2m1, uint8_t);
+#endif
+#endif
+#undef INSTANTIATE_MOE_ACTIVATION
+
+} // namespace tensorrt_llm::kernels::cute_dsl
diff --git a/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h
new file mode 100644
index 00000000000..0659b4c78f6
--- /dev/null
+++ b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include "tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h"
+#include <cstdint>
+#include <cuda_runtime_api.h>
+
+namespace tensorrt_llm::kernels::cute_dsl
+{
+template <typename InputType, typename SFType>
+void moePermute(InputType const* input, InputType* permuted_output, SFType const* input_sf, SFType* permuted_sf,
+    int32_t const* tile_idx_to_mn_limit, int32_t const* permuted_idx_to_expanded_idx,
+    int32_t const* num_non_exiting_tiles, int32_t const max_num_permuted_tokens, int32_t const hidden_size,
+    int32_t const top_k, int32_t const tile_size, cudaStream_t stream);
+
+template <typename InputType, typename TopKScaleType>
+void moeUnpermute(InputType const* permuted_input, InputType* output, int32_t const* expanded_idx_to_permuted_idx,
+    TopKScaleType const* topk_scales, int32_t const num_tokens, int32_t const hidden_size, int32_t const top_k,
+    cudaStream_t stream);
+
+template <typename InputType, typename OutputType, typename SFType>
+void moeActivation(InputType const* input, OutputType* output, float const* global_sf, SFType* output_sf,
+    int32_t const* tile_idx_to_mn_limit, int32_t const* num_non_exiting_tiles,
+    cutlass_kernels::ActivationParams activation_params, int32_t const max_num_permuted_tokens,
+    int32_t const interm_size, int32_t const tile_size, cudaStream_t stream);
+
+} // namespace tensorrt_llm::kernels::cute_dsl
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu
index 3d9ee19d654..901ecbfff64 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu
@@ -37,7 +37,6 @@
 #include "cutlass/util/packed_stride.hpp"
 
 #include "cutlass/array.h"
-#include "cutlass/epilogue/thread/activation.h"
 #include "cutlass/numeric_conversion.h"
 #include "cutlass/numeric_types.h"
 
@@ -52,6 +51,7 @@
 #include "tensorrt_llm/common/dataType.h"
 #include "tensorrt_llm/common/envUtils.h"
 #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h"
+#include "tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh"
 #include "tensorrt_llm/kernels/moe_utils.cuh"
 #include "tensorrt_llm/kernels/preQuantScaleKernel.h"
 #include "tensorrt_llm/kernels/quantization.cuh"
@@ -1344,7 +1344,7 @@ __host__ __device__ constexpr static U arrayConvert(T const& input)
     return converter(input);
 }
 
-// Duplicated and permutes rows for MoE. In addition, reverse the permutation map to help with finalizing routing.
+// Duplicates and permutes rows for MoE.
 
 // "expanded_x_row" simply means that the number of values is num_rows x k. It is "expanded" since we will have to
 // duplicate some rows in the input matrix to match the dimensions. Duplicates will always get routed to separate
@@ -1937,56 +1937,6 @@ INSTANTIATE_FINALIZE_MOE_ROUTING(float, float, float);
 INSTANTIATE_FINALIZE_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, __nv_bfloat16);
 #endif
 
-// ============================== Activation Adaptors =================================
-template