
Commit 15f34f5

xxi-nvcodego7250 authored and committed
[TRTLLM-8959][feat] ConfigurableMoE support CUTLASS (NVIDIA#9772)
1 parent c18dd5e commit 15f34f5

File tree

9 files changed: +284 -209 lines changed


tensorrt_llm/_torch/modules/fused_moe/communication/nvlink_two_sided.py

Lines changed: 5 additions & 1 deletion
@@ -65,6 +65,10 @@ def __init__(
             os.environ.get("TRTLLM_MOE_POST_QUANT_ALLTOALLV", "1") == "1"
         )
 
+        # Invalid token expert ID (default -1); the kernels in TRTLLM-gen are hard-coded to support -1 only.
+        # CutlassFusedMoE kernels support any invalid value.
+        self.invalid_token_expert_id: int = -1
+
         # Initialize NVLINK workspaces
         MnnvlMemory.initialize()
         self.alltoall_workspace = MnnvlMoe.get_moe_workspaces(mapping)
@@ -168,7 +172,7 @@ def dispatch(
             alltoall_info.recv_rank_count_cumsum,
             all_rank_max_num_tokens,
             top_k,
-            self.num_slots,
+            self.invalid_token_expert_id,
             self.ep_size,
         )
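The dispatch path now forwards a dedicated sentinel instead of self.num_slots. As a rough illustration of what that sentinel means downstream, here is a minimal PyTorch sketch (the helper name and shapes are hypothetical, not from this commit) of a combine step that drops slots marked invalid:

import torch

def combine_skip_invalid(expert_out: torch.Tensor,
                         token_expert_ids: torch.Tensor,
                         invalid_token_expert_id: int = -1) -> torch.Tensor:
    # expert_out:       [num_tokens, top_k, hidden] per-slot expert outputs
    # token_expert_ids: [num_tokens, top_k] expert IDs; padded slots carry the sentinel
    valid = (token_expert_ids != invalid_token_expert_id).unsqueeze(-1)
    # Zero out invalid slots before reducing over top_k; TRTLLM-gen kernels
    # hard-code -1 as this sentinel, while Cutlass kernels accept any value.
    return (expert_out * valid).sum(dim=1)

out = combine_skip_invalid(
    torch.randn(4, 2, 8),
    torch.tensor([[0, 1], [2, -1], [-1, -1], [3, 0]]),
)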

tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py

Lines changed: 95 additions & 33 deletions
@@ -402,6 +402,11 @@ def forward_impl(
         3. Execute MoE computation (single or multiple chunks)
         4. Handle output truncation and EPLB repeat
         """
+        # TODO: clarify whether the output_dtype is needed.
+        if isinstance(x, Fp4QuantizedTensor):
+            assert output_dtype is not None
+        else:
+            output_dtype = x.dtype
         # ========== Step 1: Handle padding ==========
         if all_rank_num_tokens is None:
             all_rank_num_tokens = [x.shape[0]]
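The added guard exists because an FP4-quantized input cannot reveal the desired output dtype: the quantized payload is bit-packed, so the tensor's storage dtype no longer matches the logical activation dtype. A minimal sketch of the reasoning (shapes illustrative; Fp4QuantizedTensor wraps such packed storage):

import torch

x_bf16 = torch.randn(8, 128, dtype=torch.bfloat16)
output_dtype = x_bf16.dtype                         # inferable: bfloat16

packed_fp4 = torch.empty(8, 64, dtype=torch.uint8)  # 128 fp4 values packed into 64 bytes
# packed_fp4.dtype is torch.uint8, not the logical activation dtype,
# so the caller must pass output_dtype explicitly -- which the assert enforces.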
@@ -662,7 +667,7 @@ def _forward_chunk_impl(
             token_final_scales=token_final_scales,
             x_sf=x_sf,
             **self._get_backend_kwargs(
-                router_logits, do_finalize, all_rank_num_tokens, output_dtype
+                router_logits, do_finalize, all_rank_num_tokens, output_dtype, x
             ),
         )

@@ -875,12 +880,68 @@ def _is_using_nvlink_two_sided(self) -> bool:
         """Check if using NVLinkTwoSided communication strategy"""
         return isinstance(self.comm, NVLinkTwoSided)
 
+    def _get_nvlink_onesided_moe_output(
+        self,
+        all_rank_num_tokens: Optional[List[int]],
+        output_dtype: Optional[torch.dtype],
+    ) -> Optional[torch.Tensor]:
+        """
+        Get workspace output buffer for the NVLinkOneSided communication backend.
+
+        This method handles moe_output allocation for both CutlassFusedMoE and TRTLLMGenFusedMoE
+        when using the NVLinkOneSided communication strategy.
+
+        Args:
+            all_rank_num_tokens: Token counts per rank
+            output_dtype: Output data type
+
+        Returns:
+            moe_output tensor if NVLinkOneSided is used and the backend supports it, None otherwise
+        """
+        if not isinstance(self.comm, NVLinkOneSided):
+            return None
+
+        # Determine workspace dtype and whether the backend supports workspace output
+        workspace_dtype = output_dtype
+        backend_supports_workspace = False
+
+        if isinstance(self.backend, TRTLLMGenFusedMoE):
+            # TRTLLMGen-specific configuration
+            self.comm.invalid_token_expert_id = -1
+            workspace_dtype = torch.bfloat16
+            backend_supports_workspace = self.backend.has_w4a8_mxfp4_mxfp8
+        elif isinstance(self.backend, CutlassFusedMoE):
+            # Cutlass always supports workspace output with NVLinkOneSided
+            backend_supports_workspace = True
+
+        if not backend_supports_workspace:
+            # Ensure payload_in_workspace is False if the backend doesn't support it
+            self.comm.payload_in_workspace = False
+            return None
+
+        # Calculate runtime max tokens per rank
+        assert all_rank_num_tokens is not None, (
+            "all_rank_num_tokens must be provided for NVLinkOneSided backend"
+        )
+        runtime_max_tokens_per_rank = max(all_rank_num_tokens)
+
+        # Get workspace-backed output tensor
+        moe_output = self.comm.get_combine_payload_tensor_in_workspace(
+            runtime_max_tokens_per_rank, self.hidden_size, workspace_dtype
+        )
+
+        # Dynamically enable payload_in_workspace for this forward pass
+        self.comm.payload_in_workspace = True
+
+        return moe_output
+
     def _get_backend_kwargs(
         self,
         router_logits: Optional[torch.Tensor] = None,
         do_finalize: bool = True,
         all_rank_num_tokens: Optional[List[int]] = None,
         output_dtype: Optional[torch.dtype] = None,
+        x: Optional[torch.Tensor] = None,
     ) -> Dict:
         """
         Get backend-specific keyword arguments for run_moe
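Summarized from the helper above, its branching reduces to a small decision table:

    comm                 MoE backend          moe_output                     workspace dtype
    -------------------  -------------------  -----------------------------  ---------------
    not NVLinkOneSided   any                  None                           -
    NVLinkOneSided       TRTLLMGenFusedMoE    only if has_w4a8_mxfp4_mxfp8   torch.bfloat16
    NVLinkOneSided       CutlassFusedMoE      always                         output_dtype

In the unsupported cases the helper also forces comm.payload_in_workspace back to False before returning None.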
@@ -905,6 +966,8 @@ def _get_backend_kwargs(
             router_logits: Router logits tensor (for TRTLLMGen backend)
             do_finalize: Whether to finalize output (for TRTLLMGen backend)
             all_rank_num_tokens: Token counts per rank (for TRTLLMGen backend moe_output)
+            output_dtype: Output data type
+            x: Input tensor (for calculating tuner_num_tokens in Cutlass)
 
         Returns:
             Dict: Backend-specific keyword arguments
@@ -917,7 +980,33 @@ def _get_backend_kwargs(
 
         # Cutlass-specific parameters
         if self.backend.__class__ == CutlassFusedMoE:
-            pass
+            # Determine if scaling factors are swizzled based on communication flow
+            # In post-quant communication (quantize -> dispatch), scaling factors are not swizzled
+            # In pre-quant communication (dispatch -> quantize), scaling factors are swizzled
+            supports_post_quant = self.comm is not None and self.comm.supports_post_quant_dispatch()
+            kwargs["is_sf_swizzled"] = not supports_post_quant
+            kwargs["output_dtype"] = output_dtype
+
+            # Prepare additional information for profiling in case padding is applied when using alltoall.
+            # Only the non-alltoall case is considered for profiling in the warmup phase.
+            # Therefore, to get the correct tactics during the actual inference, the inputs to the tuner
+            # should be the same as when not using alltoall.
+            if self._is_using_alltoall():
+                if all_rank_num_tokens is not None:
+                    kwargs["tuner_num_tokens"] = sum(all_rank_num_tokens)
+                else:
+                    kwargs["tuner_num_tokens"] = (
+                        x.shape[0] * self.mapping.tp_size if x is not None else None
+                    )
+                kwargs["tuner_top_k"] = self.routing_method.top_k
+            else:
+                kwargs["tuner_num_tokens"] = None
+                kwargs["tuner_top_k"] = None
+
+            # Get moe_output for NVLinkOneSided backend
+            kwargs["moe_output"] = self._get_nvlink_onesided_moe_output(
+                all_rank_num_tokens, output_dtype
+            )
 
         # CuteDSL-specific parameters
         elif self.backend.__class__ == CuteDslFusedMoE:
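The tuner bookkeeping above can be followed with a small standalone sketch (numbers hypothetical): during warmup only the non-alltoall path is profiled, where a rank sees the whole batch, so at inference the tuner must be handed the global token count rather than the smaller post-dispatch local one.

# Hypothetical per-rank token counts with alltoall enabled
all_rank_num_tokens = [3, 5, 4, 4]
tp_size = 4
local_tokens = 5                     # x.shape[0] on this rank

tuner_num_tokens = sum(all_rank_num_tokens)   # 16: matches the profiled, non-alltoall input
fallback_estimate = local_tokens * tp_size    # 20: upper-bound estimate when counts are unknown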
@@ -940,37 +1029,10 @@ def _get_backend_kwargs(
             kwargs["router_logits"] = router_logits_arg
             kwargs["do_finalize"] = do_finalize
 
-            # moe_output: workspace output buffer for NVLINK one-sided backend
-            # TRTLLMGenFusedMoE only supports workspace output for w4a8_mxfp4_mxfp8 quantization.
-            moe_output = None
-            if isinstance(self.comm, NVLinkOneSided):
-                # Determine dtype for workspace tensor
-                # TRTLLMGenFusedMoE always uses bfloat16, other backends use output_dtype
-                workspace_dtype = output_dtype
-                if isinstance(self.backend, TRTLLMGenFusedMoE):
-                    self.comm.invalid_token_expert_id = -1
-                    workspace_dtype = torch.bfloat16
-
-                # Check if backend supports workspace output for current quantization
-                backend_supports_workspace = (
-                    isinstance(self.backend, TRTLLMGenFusedMoE)
-                    and self.backend.has_w4a8_mxfp4_mxfp8
-                )
-                if backend_supports_workspace:
-                    assert all_rank_num_tokens is not None, (
-                        "all_rank_num_tokens must be provided for NVLinkOneSided backend with workspace output"
-                    )
-                    runtime_max_tokens_per_rank = max(all_rank_num_tokens)
-
-                    moe_output = self.comm.get_combine_payload_tensor_in_workspace(
-                        runtime_max_tokens_per_rank, self.hidden_size, workspace_dtype
-                    )
-                    # Dynamically enable payload_in_workspace for this forward pass
-                    self.comm.payload_in_workspace = True
-                else:
-                    # Ensure payload_in_workspace is False for non-workspace output
-                    self.comm.payload_in_workspace = False
-            kwargs["moe_output"] = moe_output
+            # Get moe_output for NVLinkOneSided backend
+            kwargs["moe_output"] = self._get_nvlink_onesided_moe_output(
+                all_rank_num_tokens, output_dtype
+            )
 
         return kwargs
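Both backend branches now obtain moe_output from the shared _get_nvlink_onesided_moe_output helper defined earlier in this commit; the inline TRTLLMGen-only allocation deleted above is the logic it replaces.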

tensorrt_llm/_torch/modules/fused_moe/create_moe.py

Lines changed: 1 addition & 1 deletion
@@ -346,7 +346,7 @@ def create_moe(
 
     if ENABLE_CONFIGURABLE_MOE or moe_cls == CuteDslFusedMoE:
         # ConfigurableMoE only supports TRTLLMGenFusedMoE and CuteDslFusedMoE backends
-        if moe_cls in (TRTLLMGenFusedMoE, CuteDslFusedMoE):
+        if moe_cls in (TRTLLMGenFusedMoE, CuteDslFusedMoE, CutlassFusedMoE):
            return ConfigurableMoE(
                routing_method=routing_method,
                num_experts=num_experts,
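A minimal sketch of the gate after this change (stand-in classes; the real create_moe takes the full module configuration): CutlassFusedMoE now routes through ConfigurableMoE as well, while the outer condition is unchanged.

class TRTLLMGenFusedMoE: ...
class CuteDslFusedMoE: ...
class CutlassFusedMoE: ...

ENABLE_CONFIGURABLE_MOE = True

def uses_configurable_moe(moe_cls) -> bool:
    # Mirrors the condition in create_moe: the outer gate is unchanged,
    # only the accepted backend list grows.
    return (ENABLE_CONFIGURABLE_MOE or moe_cls is CuteDslFusedMoE) and moe_cls in (
        TRTLLMGenFusedMoE,
        CuteDslFusedMoE,
        CutlassFusedMoE,
    )

assert uses_configurable_moe(CutlassFusedMoE)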
