
Commit 749c35a

Simplify FP8 AllGather implementation by reusing regular all_gather
The regular torch.ops.vllm.all_gather already supports FP8 tensors via pynccl updates (added ncclFp8E4M3 and ncclFp8E5M2 types). There's no need for a separate vllm_all_gather_fp8 custom op or FP8-specific AsyncTP patterns.

Changes:
- FP8AllGatherOptPass now uses regular all_gather with FP8 tensors
- Remove vllm_all_gather_fp8 custom op (fp8_collective_ops.py)
- Remove AllGatherFP8ScaledMMPattern and AllGatherFP8CutlassScaledMMPattern
- Existing AllGatherScaledMMPattern patterns handle FP8 automatically

Benefits:
- Simpler implementation (127 lines removed)
- Reuses existing AsyncTP fusion patterns
- No duplicate pattern matching logic

Signed-off-by: jasonlizhengjian <[email protected]>
1 parent f08ef1f commit 749c35a
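To illustrate the commit's premise that a plain all-gather can move FP8 payloads with half the bytes of the bf16 original, here is a minimal, self-contained sketch. It is not vLLM's code path: it reinterprets the FP8 tensor as uint8 so any torch.distributed backend can ship it, whereas vLLM's pynccl layer registers native ncclFp8E4M3/ncclFp8E5M2 types. The helper name and shapes are made up for the example.

```python
# Hypothetical sketch (not vLLM's implementation): quantize locally to FP8,
# then all-gather with the regular collective. Viewing the payload as uint8
# keeps it backend-agnostic; vLLM instead teaches pynccl native FP8 dtypes.
# Assumes torch.distributed is already initialized (e.g. via torchrun).
import torch
import torch.distributed as dist


def quantize_and_all_gather(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Step 1: quantize before communicating, 1 byte/element instead of 2.
    finfo = torch.finfo(torch.float8_e4m3fn)
    x_fp8 = (x / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)

    # Step 2: regular all-gather along dim 0 on the 1-byte-per-element view.
    world_size = dist.get_world_size()
    out = torch.empty((world_size * x_fp8.shape[0], *x_fp8.shape[1:]),
                      device=x_fp8.device, dtype=torch.uint8)
    dist.all_gather_into_tensor(out, x_fp8.view(torch.uint8).contiguous())
    return out.view(torch.float8_e4m3fn)
```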

3 files changed: +2 −190 lines changed


vllm/compilation/collective_fusion.py

Lines changed: 0 additions & 127 deletions
@@ -364,126 +364,6 @@ def replacement(x: torch.Tensor, weight: torch.Tensor,
                                 pm.fwd_only, pm_pass)
 
 
-class AllGatherFP8ScaledMMPattern(BasePattern):
-    """Fuse vllm_all_gather_fp8 + ScaledMM (after FP8AllGatherOptPass)"""
-
-    def get_inputs(self):
-        x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE)
-        weight = torch.empty([16, 16], device=self.device,
-                             dtype=FP8_DTYPE).contiguous().transpose(0, 1)
-
-        s1 = x.shape[0] * self.tp_size
-        scale_a = torch.empty([s1, 1], device=self.device, dtype=torch.float32)
-        scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32)
-
-        return [x, weight, scale_a, scale_b]
-
-    def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(
-            x: torch.Tensor,
-            weight: torch.Tensor,
-            scale_a: torch.Tensor,
-            scale_b: torch.Tensor,
-        ) -> torch.Tensor:
-            all_gather = torch.ops.vllm.vllm_all_gather_fp8.default(
-                x,
-                dim=0,
-                world_size=self.tp_size,
-                group_name=self.tp.unique_name)
-
-            return torch.ops.aten._scaled_mm.default(all_gather,
-                                                     mat2=weight,
-                                                     scale_a=scale_a,
-                                                     scale_b=scale_b,
-                                                     bias=None,
-                                                     scale_result=None,
-                                                     out_dtype=self.dtype)
-
-        def replacement(x: torch.Tensor, weight: torch.Tensor,
-                        scale_a: torch.Tensor,
-                        scale_b: torch.Tensor) -> torch.Tensor:
-            ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_scaled_matmul(  # noqa
-                x,
-                [weight],
-                scale_a,
-                [scale_b],
-                gather_dim=0,
-                biases=[None],
-                result_scales=[None],
-                out_dtypes=[self.dtype],
-                use_fast_accum=[False],
-                group_name=self.tp.device_group.group_name,
-            )
-            return mm_outputs
-
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
-
-
-class AllGatherFP8CutlassScaledMMPattern(BasePattern):
-    """Fuse vllm_all_gather_fp8 + CutlassScaledMM (after FP8AllGatherOptPass)"""
-
-    def get_inputs(self):
-        x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE)
-        weight = torch.empty([16, 16], device=self.device,
-                             dtype=FP8_DTYPE).contiguous().transpose(0, 1)
-
-        s1 = x.shape[0] * self.tp_size
-        scale_a = torch.empty([s1, 1], device=self.device, dtype=torch.float32)
-        scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32)
-
-        s2 = weight.shape[1]
-        output = torch.empty([s1, s2], device=self.device, dtype=self.dtype)
-
-        return [x, weight, scale_a, scale_b, output]
-
-    def register(self, pm_pass: PatternMatcherPass):
-
-        def pattern(
-            x: torch.Tensor,
-            weight: torch.Tensor,
-            scale_a: torch.Tensor,
-            scale_b: torch.Tensor,
-            output: torch.Tensor,
-        ) -> torch.Tensor:
-            all_gather = torch.ops.vllm.vllm_all_gather_fp8.default(
-                x,
-                dim=0,
-                world_size=self.tp_size,
-                group_name=self.tp.unique_name)
-
-            cutlass_scaled_mm = torch.ops.higher_order.auto_functionalized(
-                torch.ops._C.cutlass_scaled_mm.default,
-                out=output,
-                a=all_gather,
-                b=weight,
-                a_scales=scale_a,
-                b_scales=scale_b,
-                bias=None)
-            return cutlass_scaled_mm[1]
-
-        def replacement(x: torch.Tensor, weight: torch.Tensor,
-                        scale_a: torch.Tensor, scale_b: torch.Tensor,
-                        output: torch.Tensor) -> torch.Tensor:
-            ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_scaled_matmul(  # noqa
-                x,
-                [weight],
-                scale_a,
-                [scale_b],
-                gather_dim=0,
-                biases=[None],
-                result_scales=[None],
-                out_dtypes=[self.dtype],
-                use_fast_accum=[False],
-                group_name=self.tp.device_group.group_name,
-            )
-            return mm_outputs
-
-        pm.register_replacement(pattern, replacement, self.get_inputs(),
-                                pm.fwd_only, pm_pass)
-
-
 class AsyncTPPass(VllmPatternMatcherPass):
 
     @enable_fake_mode
@@ -514,13 +394,6 @@ def __init__(self, config: VllmConfig):
         AllGatherCutlassScaledMMPattern(
             self.model_dtype, self.device).register(self.patterns)
 
-        # Patterns for FP8 AllGather (after FP8AllGatherOptPass)
-        # These enable AsyncTP-style fusion on the optimized FP8 path
-        AllGatherFP8ScaledMMPattern(self.model_dtype,
-                                    self.device).register(self.patterns)
-        AllGatherFP8CutlassScaledMMPattern(
-            self.model_dtype, self.device).register(self.patterns)
-
         self.dump_patterns(config, self.patterns)
 
     def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
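For context on what the surviving non-FP8 patterns now cover, the sketch below shows the bare op sequence they target once FP8AllGatherOptPass has produced an FP8 all-gather: an FP8 activation multiplied by an FP8 weight through torch._scaled_mm with row-wise and column-wise scales, shapes loosely mirroring the removed get_inputs() above. This is an illustration, not the vLLM pattern code, and it assumes a GPU and PyTorch build where FP8 _scaled_mm with these scale layouts is available.

```python
# Standalone illustration of the ScaledMM side of the fusion; shapes loosely
# mirror the removed get_inputs() above. Assumes CUDA hardware with FP8
# support and a PyTorch build whose torch._scaled_mm accepts [M, 1] / [1, N]
# scale tensors.
import torch

device = "cuda"
M, K, N = 8, 16, 16

x_fp8 = torch.randn(M, K, device=device).to(torch.float8_e4m3fn)      # gathered activation
w_fp8 = torch.randn(N, K, device=device).to(torch.float8_e4m3fn).t()  # column-major [K, N]

scale_a = torch.ones(M, 1, device=device, dtype=torch.float32)        # per-row activation scales
scale_b = torch.ones(1, N, device=device, dtype=torch.float32)        # per-column weight scales

out = torch._scaled_mm(x_fp8, w_fp8,
                       scale_a=scale_a,
                       scale_b=scale_b,
                       out_dtype=torch.bfloat16)
print(out.shape)  # torch.Size([8, 16])
```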

vllm/compilation/fp8_allgather_pass.py

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,6 @@
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
 
-from .fp8_collective_ops import vllm_all_gather_fp8
 from .inductor_pass import enable_fake_mode
 from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
 
@@ -91,7 +90,8 @@ def replacement(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
             x_fp8 = x_clamped.to(self.fp8_dtype)
 
             # Step 2: AllGather FP8 tensors (2x less bandwidth!)
-            gathered_fp8 = vllm_all_gather_fp8(
+            # Use regular all_gather - it supports FP8 via pynccl updates
+            gathered_fp8 = torch.ops.vllm.all_gather.default(
                 x_fp8,
                 dim=0,
                 world_size=self.tp_size,
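The "2x less bandwidth" comment in the hunk above is easy to sanity-check with a little arithmetic; the tensor sizes below are illustrative and not taken from the commit.

```python
# Back-of-the-envelope check of the bandwidth comment above: bytes each rank
# contributes to one all-gather of a [tokens, hidden] activation shard.
# The sizes are illustrative, not from the commit.
tokens, hidden = 8192, 4096

bf16_bytes = tokens * hidden * 2  # 2 bytes/element if gathered before quantization
fp8_bytes = tokens * hidden * 1   # 1 byte/element after the FP8AllGatherOptPass rewrite

print(f"bf16 shard payload: {bf16_bytes / 2**20:.0f} MiB")
print(f"fp8  shard payload: {fp8_bytes / 2**20:.0f} MiB")
print(f"traffic reduction:  {bf16_bytes / fp8_bytes:.0f}x")
```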

vllm/compilation/fp8_collective_ops.py

Lines changed: 0 additions & 61 deletions
This file was deleted.
