Commit 49a3b8a

Fix fused_scaled_matmul_reduce_scatter signature for PyTorch update
Updated the torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter calls to match the new PyTorch API signature. The signature changed relative to PyTorch 2.7.1 and now requires additional positional parameters.

Changes:
- Added orig_scatter_dim and scatter_dim_after_maybe_reshape as positional parameters
- Added the output_shape calculation: [*input.shape[:-1], mat2.shape[1]]
- Changed all optional parameters (bias, result_scale, out_dtype, use_fast_accum) from keyword arguments to positional arguments to match PyTorch's torch._inductor implementation

References:
- PyTorch function definition: torch/distributed/_symmetric_memory/__init__.py:454-461
- PyTorch test usage: test/distributed/test_symmetric_memory.py:579-590
- PyTorch inductor usage: torch/_inductor/fx_passes/micro_pipeline_tp.py:816-834

Signed-off-by: jasonlizhengjian <[email protected]>
1 parent 5db1870 commit 49a3b8a
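For reference, a minimal sketch of the call-shape change described above. The helper fused_gemm_rs and its parameters are illustrative only (not part of the commit); the positional ordering simply mirrors the updated calls in the diff below and the PyTorch files referenced in the commit message.

    # Sketch (not from the commit): old keyword-style call vs. new positional call,
    # assuming the argument order used in the updated vLLM code below.
    import torch

    def fused_gemm_rs(input: torch.Tensor, mat2: torch.Tensor,
                      scale_a: torch.Tensor, scale_b: torch.Tensor,
                      group_name: str, out_dtype: torch.dtype) -> torch.Tensor:
        # Shape of input @ mat2 before the reduce-scatter splits scatter_dim.
        output_shape = [*input.shape[:-1], mat2.shape[1]]
        scatter_dim = 0

        # Old (PyTorch 2.7.1-era) call used keyword arguments:
        #   torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter(
        #       input, mat2, scale_a, scale_b, "avg",
        #       scatter_dim=0, out_dtype=out_dtype, group_name=group_name)
        #
        # New call passes everything positionally:
        return torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter(
            input,
            mat2,
            scale_a,
            scale_b,
            "avg",          # reduce op
            scatter_dim,    # orig_scatter_dim
            scatter_dim,    # scatter_dim_after_maybe_reshape
            group_name,
            output_shape,
            None,           # bias
            None,           # result_scale
            out_dtype,      # out_dtype
            False,          # use_fast_accum
        )

The previously keyword-only optionals (bias, result_scale, out_dtype, use_fast_accum) are now supplied positionally, matching the torch._inductor usage referenced above.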

File tree: 1 file changed (+22, -6 lines)

vllm/compilation/collective_fusion.py

Lines changed: 22 additions & 6 deletions
@@ -156,15 +156,23 @@ def pattern(input: torch.Tensor, mat2: torch.Tensor,
     def replacement(input: torch.Tensor, mat2: torch.Tensor,
                     scale_a: torch.Tensor,
                     scale_b: torch.Tensor) -> torch.Tensor:
+        # Calculate output shape: input @ mat2 with scatter_dim reduced
+        output_shape = [*input.shape[:-1], mat2.shape[1]]
+        scatter_dim = 0
         gemm_rs = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter(
             input,
             mat2,
             scale_a,
             scale_b,
             "avg",
-            scatter_dim=0,
-            out_dtype=self.dtype,
-            group_name=self.tp.device_group.group_name,
+            scatter_dim,  # orig_scatter_dim
+            scatter_dim,  # scatter_dim_after_maybe_reshape
+            self.tp.device_group.group_name,
+            output_shape,
+            None,  # bias
+            None,  # result_scale
+            self.dtype,  # out_dtype
+            False,  # use_fast_accum
         )
 
         return gemm_rs
@@ -268,15 +276,23 @@ def pattern(input: torch.Tensor, weight: torch.Tensor,
     def replacement(input: torch.Tensor, mat2: torch.Tensor,
                     scale_a: torch.Tensor, scale_b: torch.Tensor,
                     cutlass_mm_output: torch.Tensor) -> torch.Tensor:
+        # Calculate output shape: input @ mat2 with scatter_dim reduced
+        output_shape = [*input.shape[:-1], mat2.shape[1]]
+        scatter_dim = 0
         gemm_rs = torch.ops.symm_mem.fused_scaled_matmul_reduce_scatter(
             input,
             mat2,
             scale_a,
             scale_b,
             "avg",
-            scatter_dim=0,
-            out_dtype=self.dtype,
-            group_name=self.tp.device_group.group_name,
+            scatter_dim,  # orig_scatter_dim
+            scatter_dim,  # scatter_dim_after_maybe_reshape
+            self.tp.device_group.group_name,
+            output_shape,
+            None,  # bias
+            None,  # result_scale
+            self.dtype,  # out_dtype
+            False,  # use_fast_accum
         )
 
         return gemm_rs
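As a small worked example of the output_shape calculation added in both hunks (the shapes here are hypothetical, not from the commit):

    # Hypothetical shapes illustrating output_shape = [*input.shape[:-1], mat2.shape[1]].
    import torch

    input = torch.empty(128, 4096)   # [M, K]
    mat2 = torch.empty(4096, 1024)   # [K, N]

    output_shape = [*input.shape[:-1], mat2.shape[1]]
    assert output_shape == [128, 1024]  # [M, N]; the op then reduce-scatters along dim 0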
