Commit 6021a43

Make moe permute and final as custom op (NVIDIA#5412)
Signed-off-by: Mindy Li <[email protected]>
1 parent 5773cfd commit 6021a43

File tree

11 files changed: +2100 −2 lines changed

cpp/tensorrt_llm/kernels/moeUtilOp.cu

Lines changed: 893 additions & 0 deletions
Large diffs are not rendered by default.
cpp/tensorrt_llm/kernels/moeUtilOp.h

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cutlass_kernels/include/moe_kernels.h"
+#include "tensorrt_llm/common/cudaUtils.h"
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace tensorrt_llm::kernels
+{
+bool fusedBuildExpertMapsSortFirstToken(int const* token_selected_experts, int* unpermuted_token_selected_experts,
+    int* permuted_source_token_ids, int64_t* expert_first_token_offset, int64_t const num_tokens,
+    int const num_experts_per_node, int const experts_per_token, int const start_expert, int const end_expert,
+    cudaStream_t stream);
+
+void buildExpertMaps(int const* token_selected_experts, int* unpermuted_token_selected_experts,
+    int* unpermuted_source_token_ids, int64_t const num_tokens, int const num_experts_per_node,
+    int const experts_per_token, int const start_expert, int const end_expert, cudaStream_t stream);
+
+void generateTokenPermutation(int const* unpermuted_token_selected_experts, int const* unpermuted_source_token_ids,
+    int* permuted_token_selected_experts, int* permuted_source_token_ids, int64_t* expert_first_token_offset,
+    int64_t num_rows, int64_t num_experts_per_node, int64_t k, cutlass_kernels::CubKeyValueSorter& sorter,
+    void* sorter_ws, cudaStream_t stream);
+
+template <class InputActivationsType, class ExpandedActivationsType>
+void expandInputRowsKernelLauncher(InputActivationsType const* unpermuted_input,
+    ExpandedActivationsType* permuted_output, float const* unpermuted_scales, float* permuted_scales,
+    int const* expanded_dest_row_to_expanded_source_row, int* expanded_source_row_to_expanded_dest_row,
+    int64_t const num_rows, int64_t const* num_valid_tokens_ptr, int64_t const cols, int const k,
+    int const num_experts_per_node, float const* fc1_act_global_scale, int64_t* expert_first_token_offset,
+    cutlass_kernels::TmaWarpSpecializedGroupedGemmInput::ElementSF* fc1_act_sf_flat,
+    cutlass_kernels::TmaWarpSpecializedGroupedGemmInput::ElementSF const* input_sf, cudaStream_t stream);
+
+template <class OutputType, class GemmOutputType, class ScaleBiasType>
+void finalizeMoeRoutingKernelLauncher(GemmOutputType const* expanded_permuted_rows,
+    OutputType* reduced_unpermuted_output, ScaleBiasType const* bias, float const* final_scales,
+    int const* expanded_source_row_to_expanded_dest_row, int const* expert_for_source_row, int64_t const num_rows,
+    int64_t const cols, int64_t const experts_per_token, int64_t const* num_valid_ptr,
+    cutlass_kernels::MOEParallelismConfig parallelism_config, cudaStream_t stream);
+
+} // namespace tensorrt_llm::kernels
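
These declarations decompose MoE routing into stages: build the token-to-expert maps (buildExpertMaps, or the fused fusedBuildExpertMapsSortFirstToken fast path), sort the expanded token slots so each expert's rows are contiguous (generateTokenPermutation, backed by cutlass_kernels::CubKeyValueSorter), scatter the input rows into that order (expandInputRowsKernelLauncher), and finally un-permute and scale-reduce the expert GEMM output (finalizeMoeRoutingKernelLauncher). As a mental model of the metadata the sort stage produces, here is a hedged CPU reference sketch in PyTorch; it is illustrative only, and the stable-sort tie-breaking and offset layout are assumptions read off the declaration names, not the CUDA implementation:

import torch

def reference_token_permutation(token_selected_experts: torch.Tensor,
                                num_experts_per_node: int):
    # token_selected_experts: [num_tokens, k], one expert id per (token, slot).
    num_tokens, k = token_selected_experts.shape
    flat_experts = token_selected_experts.reshape(-1).to(torch.int64)
    source_token_ids = torch.arange(num_tokens * k, dtype=torch.int32)

    # A stable sort by expert id groups all slots of one expert together,
    # mirroring what the CUB-based sorter computes on device.
    sorted_experts, order = torch.sort(flat_experts, stable=True)
    permuted_source_token_ids = source_token_ids[order]

    # expert_first_token_offset[e] = first permuted slot owned by expert e;
    # the final entry equals the total number of expanded rows.
    counts = torch.bincount(sorted_experts, minlength=num_experts_per_node)
    expert_first_token_offset = torch.zeros(num_experts_per_node + 1,
                                            dtype=torch.int64)
    expert_first_token_offset[1:] = torch.cumsum(counts, dim=0)
    return sorted_experts, permuted_source_token_ids, expert_first_token_offset

For example, token_selected_experts = [[0, 2], [1, 0]] with num_experts_per_node = 4 gives expert_first_token_offset = [0, 2, 3, 4, 4]: expert 0 owns permuted slots 0 and 1, expert 1 owns slot 2, expert 2 owns slot 3, and expert 3 owns no rows.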

cpp/tensorrt_llm/kernels/quantization.cuh

Lines changed: 1 addition & 1 deletion
@@ -275,7 +275,7 @@ __global__ void perTokenQuantization(QuantT* dst, T const* src, int64_t const nu
 // FP4 Quantization

 constexpr int CVT_FP4_ELTS_PER_THREAD = 8;
-// constexpr int CVT_FP4_SF_VEC_SIZE = 16;
+constexpr int CVT_FP4_SF_VEC_SIZE = 16;
 constexpr int CVT_FP4_THREADS_PER_WARP = 32;
 constexpr int CVT_FP8_TO_FP4_ELTS_PER_THREAD = 16;
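
Un-commenting CVT_FP4_SF_VEC_SIZE makes the FP4 scale-factor group size a live constant: one scale factor covers 16 consecutive FP4 elements, which the new permute path presumably needs when sizing the flat fc1_act_sf_flat / input_sf buffers it shuffles alongside the activations. Quick illustrative arithmetic, with every size made up:

CVT_FP4_SF_VEC_SIZE = 16   # elements covered by one FP4 scale factor (from the diff)

num_expanded_rows = 256    # hypothetical: num_tokens * experts_per_token
hidden_size = 4096         # hypothetical activation width

sf_per_row = hidden_size // CVT_FP4_SF_VEC_SIZE   # 256 scale factors per row
total_sf = num_expanded_rows * sf_per_row         # 65536 entries in a flat SF buffer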

cpp/tensorrt_llm/thop/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -65,6 +65,7 @@ add_library(
   logitsBitmaskOp.cpp
   mambaConv1dOp.cpp
   moeOp.cpp
+  moeUtilOp.cpp
   moeCommOp.cpp
   moeLoadBalanceOp.cpp
   fp8BlockScaleMoe.cpp

cpp/tensorrt_llm/thop/moeUtilOp.cpp

Lines changed: 449 additions & 0 deletions
Large diffs are not rendered by default.

tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py

Lines changed: 76 additions & 0 deletions
@@ -397,3 +397,79 @@ def _(
     pad_slot_id: int,
 ) -> None:
     pass
+
+@torch.library.register_fake("trtllm::moe_permute_op")
+def _(
+    input: torch.Tensor,
+    token_selected_experts: torch.Tensor,
+    token_final_scales: torch.Tensor,
+    fc1_expert_weights: torch.Tensor,
+    fc2_expert_weights: torch.Tensor,
+    quant_scales: List[torch.Tensor],
+    input_sf: Optional[torch.Tensor],
+    num_experts_per_node: int,
+    tp_size: int,
+    tp_rank: int,
+    ep_size: int,
+    ep_rank: int,
+    cluster_size: int,
+    cluster_rank: int,
+    min_latency_mode: bool,
+    use_fp8_block_scaling: bool,
+):
+
+    experts_per_token = token_selected_experts.shape[1]
+    num_rows = input.shape[0]
+    hidden_size = input.shape[1]
+
+    num_moe_inputs = experts_per_token * num_rows
+
+    unpermuted_token_selected_experts_tensor = token_selected_experts.new_empty(
+        (num_moe_inputs, ), dtype=torch.int32)
+    unpermuted_source_token_ids_tensor = token_selected_experts.new_empty(
+        (num_moe_inputs, ), dtype=torch.int32)
+    permuted_source_token_ids_tensor = token_selected_experts.new_empty(
+        (num_moe_inputs, ), dtype=torch.int32)
+    permuted_token_selected_experts_tensor = token_selected_experts.new_empty(
+        (num_moe_inputs, ), dtype=torch.int32)
+    permuted_data_tensor = input.new_empty((num_moe_inputs, hidden_size),
+                                           dtype=torch.float32)
+    expert_first_token_offset_tensor = token_selected_experts.new_empty(
+        (num_experts_per_node + 1, ), dtype=torch.int64)
+    permuted_token_final_scales_tensor = token_selected_experts.new_empty(
+        (num_moe_inputs, ), dtype=torch.float32)
+    src_to_dest_map_tensor = token_selected_experts.new_empty(
+        (num_moe_inputs, ), dtype=torch.int32)
+
+    return (
+        unpermuted_token_selected_experts_tensor,
+        unpermuted_source_token_ids_tensor,
+        permuted_source_token_ids_tensor,
+        permuted_token_selected_experts_tensor,
+        permuted_data_tensor,
+        expert_first_token_offset_tensor,
+        permuted_token_final_scales_tensor,
+        src_to_dest_map_tensor,
+    )
+
+@torch.library.register_fake("trtllm::moe_finalize_scale_op")
+def _(
+    gemm2_output: torch.Tensor,
+    fc2_expert_biases: torch.Tensor,
+    unpermuted_final_scales: torch.Tensor,
+    expanded_source_row_to_expanded_dest_row: torch.Tensor,
+    expert_for_source_row: torch.Tensor,
+    expert_first_token_offset_tensor: torch.Tensor,
+    num_rows: torch.SymInt,
+    hidden_size: torch.SymInt,
+    experts_per_token: int,
+    num_experts_per_node: int,
+    tp_size: int,
+    tp_rank: int,
+    ep_size: int,
+    ep_rank: int,
+):
+    num_rows_val = int(num_rows)
+    hidden_size_val = int(hidden_size)
+    return gemm2_output.new_empty((num_rows_val, hidden_size_val),
+                                  dtype=gemm2_output.dtype)
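
These fake (meta) kernels give the two new custom ops shape and dtype semantics, which is what lets FakeTensor tracing and torch.compile propagate shapes through the graph without launching any CUDA kernel. A hedged smoke test of the idea; it assumes TensorRT-LLM's C++ op library is loaded so that torch.ops.trtllm.moe_permute_op resolves, and every tensor size below is made up:

import torch
from torch._subclasses.fake_tensor import FakeTensorMode

with FakeTensorMode():
    # 32 tokens, hidden size 4096, top-8 routing, 64 experts on this node.
    hidden_states = torch.empty(32, 4096, dtype=torch.bfloat16)
    token_selected_experts = torch.empty(32, 8, dtype=torch.int32)
    token_final_scales = torch.empty(32, 8, dtype=torch.float32)
    fc1_w = torch.empty(64, 8192, 4096, dtype=torch.bfloat16)  # weight shapes are guesses
    fc2_w = torch.empty(64, 4096, 4096, dtype=torch.bfloat16)

    outs = torch.ops.trtllm.moe_permute_op(
        hidden_states, token_selected_experts, token_final_scales,
        fc1_w, fc2_w,
        [],      # quant_scales
        None,    # input_sf
        64,      # num_experts_per_node
        1, 0,    # tp_size, tp_rank
        1, 0,    # ep_size, ep_rank
        1, 0,    # cluster_size, cluster_rank
        False,   # min_latency_mode
        False,   # use_fp8_block_scaling
    )

    # Only the register_fake body above ran: no kernel launch, just shapes.
    assert outs[4].shape == (32 * 8, 4096)   # permuted_data_tensor
    assert outs[5].shape == (64 + 1,)        # expert_first_token_offset_tensor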

tensorrt_llm/_torch/modules/fused_moe/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -1,4 +1,5 @@
 from .create_moe import create_moe, get_moe_cls
+from .fused_moe_cute_dsl import CuteDslFusedMoE
 from .fused_moe_cutlass import CutlassFusedMoE
 from .fused_moe_trtllm_gen import TRTLLMGenFusedMoE
 from .fused_moe_vanilla import VanillaMoE
@@ -17,6 +18,7 @@
 __all__ = [
     "BaseMoeRoutingMethod",
     "create_moe",
+    "CuteDslFusedMoE",
     "CutlassFusedMoE",
     "DeepSeekV3MoeRoutingMethod",
     "DefaultMoeRoutingMethod",

tensorrt_llm/_torch/modules/fused_moe/create_moe.py

Lines changed: 17 additions & 0 deletions
@@ -6,6 +6,7 @@
 from tensorrt_llm.models.modeling_utils import QuantConfig

 from ...model_config import ModelConfig
+from .fused_moe_cute_dsl import CuteDslFusedMoE
 from .fused_moe_cutlass import CutlassFusedMoE
 from .fused_moe_trtllm_gen import TRTLLMGenFusedMoE
 from .fused_moe_vanilla import VanillaMoE
@@ -28,6 +29,8 @@ def get_moe_cls(
         return CutlassFusedMoE
     elif moe_backend.upper() == "VANILLA":
         return VanillaMoE
+    elif moe_backend.upper() == "CUTEDSL":
+        return CuteDslFusedMoE
     elif moe_backend.upper() == "TRTLLM":
         if quant_config is not None and (
             quant_config.quant_mode.has_fp8_block_scales()
@@ -122,5 +125,19 @@ def create_moe(
             weight_loading_mode=weight_loading_mode,
             apply_router_weight_on_input=apply_router_weight_on_input,
         )
+    elif moe_cls == CuteDslFusedMoE:
+        return moe_cls(
+            routing_method=routing_method,
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            dtype=dtype,
+            reduce_results=reduce_results,
+            model_config=model_config,
+            aux_stream=aux_stream,
+            weight_loading_mode=weight_loading_mode,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+            layer_idx=layer_idx,
+        )
     else:
         raise ValueError(f"Unsupported moe backend: {moe_cls}")
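
The new branch mirrors the CutlassFusedMoE construction above and forwards one extra argument, layer_idx. A hedged construction sketch; every value is a placeholder, and routing_method, model_config, aux_stream, and weight_loading_mode would come from the surrounding model code that this diff does not show:

import torch
from tensorrt_llm._torch.modules.fused_moe import CuteDslFusedMoE

# Placeholder arguments; only the keyword set is taken from the diff above.
moe = CuteDslFusedMoE(
    routing_method=routing_method,    # e.g. a DefaultMoeRoutingMethod instance
    num_experts=64,
    hidden_size=4096,
    intermediate_size=14336,
    dtype=torch.bfloat16,
    reduce_results=True,
    model_config=model_config,
    aux_stream=aux_stream,
    weight_loading_mode=weight_loading_mode,
    apply_router_weight_on_input=False,
    layer_idx=0,                      # the one kwarg the Cutlass branch does not pass
)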
