Fix alltoall in the CuteDSL fused MoE path (split out the in-place NVFP4 grouped-GEMM finalize op)

syuoni · syuoni · commit 2082a0c9ed8a · 2025-12-03T01:37:11.000Z
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/compilation/utils.py b/tensorrt_llm/_torch/compilation/utils.py
@@ -80,7 +80,7 @@ def inplace_info():
         torch.ops.trtllm.moe_output_memset_inplace.default: {
             1: "input"
         },
-        torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_finalize_blackwell.default:
+        torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_finalize_inplace_blackwell.default:
         {
             6: "output"
         }
diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py
@@ -1182,16 +1182,16 @@ def forward(self, inputs: List[torch.Tensor],
             return c
 
     @torch.library.custom_op(
-        "trtllm::cute_dsl_nvfp4_grouped_gemm_finalize_blackwell",
+        "trtllm::cute_dsl_nvfp4_grouped_gemm_finalize_inplace_blackwell",
         mutates_args=("output", ),
         device_types="cuda")
-    def cute_dsl_nvfp4_grouped_gemm_finalize_blackwell(
+    def cute_dsl_nvfp4_grouped_gemm_finalize_inplace_blackwell(
         input: torch.Tensor,
         weight: torch.Tensor,
         input_scale: torch.Tensor,
         weight_scale: torch.Tensor,
         alpha: torch.Tensor,
-        output: Optional[torch.Tensor],
+        output: torch.Tensor,
         tile_idx_to_group_idx: torch.Tensor,
         tile_idx_to_mn_limit: torch.Tensor,
         permuted_idx_to_expanded_idx: torch.Tensor,
@@ -1204,21 +1204,13 @@ def cute_dsl_nvfp4_grouped_gemm_finalize_blackwell(
         tile_size: int,
         output_dtype: torch.dtype,
         scaling_vector_size: int = 16,
-    ) -> torch.Tensor:
+    ) -> None:
         tuner = AutoTuner.get()
 
         runner = Sm100BlockScaledContiguousGroupedGemmFinalizeFusionRunner(
             num_experts, top_k, num_local_experts, local_expert_offset,
             tile_size, output_dtype, scaling_vector_size)
 
-        if output is None:
-            num_tokens = token_final_scales.size(0)
-            n = weight.size(1)
-            output = torch.zeros(num_tokens,
-                                 n,
-                                 dtype=output_dtype,
-                                 device=input.device)
-
         inputs = [
             input, weight, input_scale, weight_scale, alpha, output,
             tile_idx_to_group_idx, tile_idx_to_mn_limit,
@@ -1227,12 +1219,62 @@ def cute_dsl_nvfp4_grouped_gemm_finalize_blackwell(
         ]
 
         _, best_tactic = tuner.choose_one(
-            "trtllm::cute_dsl_nvfp4_grouped_gemm_finalize_blackwell",
+            "trtllm::cute_dsl_nvfp4_grouped_gemm_finalize_inplace_blackwell",
             [runner],
             runner.get_tuning_config(),
             inputs,
         )
-        output = runner(inputs, tactic=best_tactic)
+        runner(inputs, tactic=best_tactic)
+
+    @torch.library.custom_op(
+        "trtllm::cute_dsl_nvfp4_grouped_gemm_finalize_blackwell",
+        mutates_args=(),
+        device_types="cuda")
+    def cute_dsl_nvfp4_grouped_gemm_finalize_blackwell(
+        input: torch.Tensor,
+        weight: torch.Tensor,
+        input_scale: torch.Tensor,
+        weight_scale: torch.Tensor,
+        alpha: torch.Tensor,
+        tile_idx_to_group_idx: torch.Tensor,
+        tile_idx_to_mn_limit: torch.Tensor,
+        permuted_idx_to_expanded_idx: torch.Tensor,
+        num_non_exiting_tiles: torch.Tensor,
+        token_final_scales: torch.Tensor,
+        num_experts: int,
+        top_k: int,
+        num_local_experts: int,
+        local_expert_offset: int,
+        tile_size: int,
+        output_dtype: torch.dtype,
+        scaling_vector_size: int = 16,
+    ) -> torch.Tensor:
+        num_tokens = token_final_scales.size(0)
+        n = weight.size(1)
+        output = torch.zeros(num_tokens,
+                             n,
+                             dtype=output_dtype,
+                             device=input.device)
+        torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_finalize_inplace_blackwell(
+            input=input,
+            weight=weight,
+            input_scale=input_scale,
+            weight_scale=weight_scale,
+            alpha=alpha,
+            output=output,
+            tile_idx_to_group_idx=tile_idx_to_group_idx,
+            tile_idx_to_mn_limit=tile_idx_to_mn_limit,
+            permuted_idx_to_expanded_idx=permuted_idx_to_expanded_idx,
+            num_non_exiting_tiles=num_non_exiting_tiles,
+            token_final_scales=token_final_scales,
+            num_experts=num_experts,
+            top_k=top_k,
+            num_local_experts=num_local_experts,
+            local_expert_offset=local_expert_offset,
+            tile_size=tile_size,
+            output_dtype=output_dtype,
+            scaling_vector_size=scaling_vector_size,
+        )
         return output
 
     @torch.library.register_fake(
@@ -1243,7 +1285,6 @@ def _(
         input_scale: torch.Tensor,
         weight_scale: torch.Tensor,
         alpha: torch.Tensor,
-        output: Optional[torch.Tensor],
         tile_idx_to_group_idx: torch.Tensor,
         tile_idx_to_mn_limit: torch.Tensor,
         permuted_idx_to_expanded_idx: torch.Tensor,
diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
@@ -908,27 +908,27 @@ def _get_backend_kwargs(
         kwargs = {}
 
         # Common parameters for Cutlass and DeepGemm
-        if isinstance(self.backend, (CutlassFusedMoE, DeepGemmFusedMoE)):
+        if self.backend.__class__ in (CutlassFusedMoE, DeepGemmFusedMoE, CuteDslFusedMoE):
             pass
 
         # Cutlass-specific parameters
-        if isinstance(self.backend, CutlassFusedMoE):
+        if self.backend.__class__ == CutlassFusedMoE:
             pass
 
         # CuteDSL-specific parameters
-        elif isinstance(self.backend, CuteDslFusedMoE):
+        elif self.backend.__class__ == CuteDslFusedMoE:
             kwargs["enable_alltoall"] = self.enable_alltoall
 
         # WideEP-specific parameters
-        elif isinstance(self.backend, WideEPMoE):
+        elif self.backend.__class__ == WideEPMoE:
             pass
 
         # DeepGemm-specific parameters
-        elif isinstance(self.backend, DeepGemmFusedMoE):
+        elif self.backend.__class__ == DeepGemmFusedMoE:
             pass
 
         # TRTLLMGen-specific parameters
-        elif isinstance(self.backend, TRTLLMGenFusedMoE):
+        elif self.backend.__class__ == TRTLLMGenFusedMoE:
             # Determine router_logits based on whether routing has been done
             # If backend doesn't support load balancer, routing is done before communication
             # In that case, router_logits should be None (routing already done)
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py
@@ -259,7 +259,6 @@ def run_moe_nvfp4(
         enable_alltoall: bool = False,
     ) -> torch.Tensor:
         assert self.has_nvfp4
-        output_shape = x.size()
         output_dtype = torch.bfloat16
         tile_size = 128
 
@@ -297,21 +296,22 @@ def run_moe_nvfp4(
             tile_size=tile_size,
         )
         if self.use_fused_finalize:
-            output = None
+            output = torch.empty((token_final_scales.size(0), self.hidden_size),
+                                 dtype=output_dtype,
+                                 device=x.device)
             if enable_alltoall:
-                output = torch.empty(output_shape,
-                                     dtype=output_dtype,
-                                     device=x.device)
                 torch.ops.trtllm.moe_output_memset_inplace(
-                    output=output,
+                    input=output,
                     tile_idx_to_mn_limit=tile_idx_to_mn_limit,
                     expanded_idx_to_permuted_idx=expanded_idx_to_permuted_idx,
                     permuted_idx_to_expanded_idx=permuted_idx_to_expanded_idx,
                     num_non_exiting_tiles=num_non_exiting_tiles,
-                    tile_size=tile_size,
+                    tile_tokens_dim=tile_size,
                     top_k=self.routing_method.experts_per_token,
                 )
-            x = torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_finalize_blackwell(
+            else:
+                output.fill_(0)
+            torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_finalize_inplace_blackwell(
                 input=x.view(torch.float4_e2m1fn_x2),
                 weight=self.w2_weight.view(torch.float4_e2m1fn_x2),
                 input_scale=x_sf.view(torch.uint8),
@@ -331,6 +331,7 @@ def run_moe_nvfp4(
                 tile_size=tile_size,
                 output_dtype=output_dtype,
             )
+            x = output
         else:
             x = torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_blackwell(
                 input=x.view(torch.float4_e2m1fn_x2),
@@ -462,13 +463,15 @@ def run_moe(
                 x=x,
                 token_selected_experts=token_selected_experts,
                 token_final_scales=token_final_scales,
-                x_sf=x_sf)
+                x_sf=x_sf,
+                enable_alltoall=enable_alltoall)
         elif self.has_deepseek_fp8_block_scales:
             return self.run_moe_fp8_block_scales(
                 x=x,
                 token_selected_experts=token_selected_experts,
                 token_final_scales=token_final_scales,
-                x_sf=x_sf)
+                x_sf=x_sf,
+                enable_alltoall=enable_alltoall)
         else:
             raise ValueError(
                 f"{self.__class__.__name__} doesn't support quantization mode {self.quant_config.quant_mode}."
diff --git a/tests/unittest/_torch/thop/parallel/test_cute_dsl_moe.py b/tests/unittest/_torch/thop/parallel/test_cute_dsl_moe.py
@@ -535,7 +535,6 @@ def test_nvfp4_grouped_gemm_finalize_blackwell(
         a_sf,
         b_sf,
         alpha,
-        None,  # output
         tile_idx_to_group_idx,
         tile_idx_to_mn_limit,
         permuted_idx_to_expanded_idx,

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -80,7 +80,7 @@ def inplace_info():` |
| `80` | `80` | `torch.ops.trtllm.moe_output_memset_inplace.default: {` |
| `81` | `81` | `1: "input"` |
| `82` | `82` | `},` |
| `83` |  | `- torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_finalize_blackwell.default:` |
|  | `83` | `+ torch.ops.trtllm.cute_dsl_nvfp4_grouped_gemm_finalize_inplace_blackwell.default:` |
| `84` | `84` | `{` |
| `85` | `85` | `6: "output"` |
| `86` | `86` | `}` |