Add AsyncTP fusion patterns for FP8 AllGather

jasonlizhengjian · jasonlizhengjian · commit 75dc2796a972 · 2025-10-05T16:37:23.000Z
Adds AllGatherFP8ScaledMMPattern and AllGatherFP8CutlassScaledMMPattern
to enable AsyncTP-style fusion after FP8AllGatherOptPass runs.

This enables:
- Communication/computation overlap for FP8 AllGather + ScaledMM
- Reduced kernel launch overhead
- Better memory access patterns

Pattern matching sequence:
1. FP8AllGatherOptPass: AllGather(BF16) + to(FP8) -> vllm_all_gather_fp8
2. AsyncTPPass: vllm_all_gather_fp8 + ScaledMM -> fused_all_gather_scaled_matmul

This combines 2x bandwidth reduction (FP8) with computation overlap (AsyncTP).

Signed-off-by: jasonlizhengjian <jasonlizhengjian@gmail.com>
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
@@ -364,6 +364,126 @@ def replacement(x: torch.Tensor, weight: torch.Tensor,
                                 pm.fwd_only, pm_pass)
 
 
+class AllGatherFP8ScaledMMPattern(BasePattern):
+    """Fuse vllm_all_gather_fp8 + ScaledMM (after FP8AllGatherOptPass)"""
+
+    def get_inputs(self):  # example tensors passed to register_replacement
+        x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE)
+        weight = torch.empty([16, 16], device=self.device,
+                             dtype=FP8_DTYPE).contiguous().transpose(0, 1)
+
+        s1 = x.shape[0] * self.tp_size  # x rows * tp_size (dim-0 gather size)
+        scale_a = torch.empty([s1, 1], device=self.device, dtype=torch.float32)
+        scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32)
+
+        return [x, weight, scale_a, scale_b]
+
+    def register(self, pm_pass: PatternMatcherPass):
+        # Registers `pattern` -> `replacement` on pm_pass with pm.fwd_only.
+        def pattern(
+            x: torch.Tensor,
+            weight: torch.Tensor,
+            scale_a: torch.Tensor,
+            scale_b: torch.Tensor,
+        ) -> torch.Tensor:
+            all_gather = torch.ops.vllm.vllm_all_gather_fp8.default(
+                x,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp.unique_name)  # matched on tp.unique_name
+
+            return torch.ops.aten._scaled_mm.default(all_gather,
+                                                     mat2=weight,
+                                                     scale_a=scale_a,
+                                                     scale_b=scale_b,
+                                                     bias=None,
+                                                     scale_result=None,
+                                                     out_dtype=self.dtype)
+
+        def replacement(x: torch.Tensor, weight: torch.Tensor,
+                        scale_a: torch.Tensor,
+                        scale_b: torch.Tensor) -> torch.Tensor:
+            ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_scaled_matmul(  # noqa
+                x,
+                [weight],
+                scale_a,
+                [scale_b],
+                gather_dim=0,
+                biases=[None],
+                result_scales=[None],
+                out_dtypes=[self.dtype],
+                use_fast_accum=[False],
+                # NOTE(review): pattern uses tp.unique_name, this uses
+                # device_group.group_name -- confirm both are intended.
+                group_name=self.tp.device_group.group_name,
+            )
+            return mm_outputs  # ag_output is not used
+
+        pm.register_replacement(pattern, replacement, self.get_inputs(),
+                                pm.fwd_only, pm_pass)
+
+
+class AllGatherFP8CutlassScaledMMPattern(BasePattern):
+    """Fuse vllm_all_gather_fp8 + CutlassScaledMM (after FP8AllGatherOptPass)"""
+
+    def get_inputs(self):  # example tensors passed to register_replacement
+        x = torch.empty([8, 16], device=self.device, dtype=FP8_DTYPE)
+        weight = torch.empty([16, 16], device=self.device,
+                             dtype=FP8_DTYPE).contiguous().transpose(0, 1)
+
+        s1 = x.shape[0] * self.tp_size  # x rows * tp_size (dim-0 gather size)
+        scale_a = torch.empty([s1, 1], device=self.device, dtype=torch.float32)
+        scale_b = torch.empty([1, 16], device=self.device, dtype=torch.float32)
+
+        s2 = weight.shape[1]  # column count of the transposed weight view
+        output = torch.empty([s1, s2], device=self.device, dtype=self.dtype)
+
+        return [x, weight, scale_a, scale_b, output]
+
+    def register(self, pm_pass: PatternMatcherPass):
+        # Registers `pattern` -> `replacement` on pm_pass with pm.fwd_only.
+        def pattern(
+            x: torch.Tensor,
+            weight: torch.Tensor,
+            scale_a: torch.Tensor,
+            scale_b: torch.Tensor,
+            output: torch.Tensor,
+        ) -> torch.Tensor:
+            all_gather = torch.ops.vllm.vllm_all_gather_fp8.default(
+                x,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp.unique_name)  # matched on tp.unique_name
+
+            cutlass_scaled_mm = torch.ops.higher_order.auto_functionalized(
+                torch.ops._C.cutlass_scaled_mm.default,
+                out=output,  # `output` is handed to the op as its `out` argument
+                a=all_gather,
+                b=weight,
+                a_scales=scale_a,
+                b_scales=scale_b,
+                bias=None)
+            return cutlass_scaled_mm[1]  # presumably the updated `out` -- confirm
+
+        def replacement(x: torch.Tensor, weight: torch.Tensor,
+                        scale_a: torch.Tensor, scale_b: torch.Tensor,
+                        output: torch.Tensor) -> torch.Tensor:  # output unused
+            ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_scaled_matmul(  # noqa
+                x,
+                [weight],
+                scale_a,
+                [scale_b],
+                gather_dim=0,
+                biases=[None],
+                result_scales=[None],
+                out_dtypes=[self.dtype],
+                use_fast_accum=[False],
+                # NOTE(review): pattern uses tp.unique_name, this uses
+                # device_group.group_name -- confirm both are intended.
+                group_name=self.tp.device_group.group_name,
+            )
+            return mm_outputs  # ag_output is not used
+
+        pm.register_replacement(pattern, replacement, self.get_inputs(),
+                                pm.fwd_only, pm_pass)
+
+
 class AsyncTPPass(VllmPatternMatcherPass):
 
     @enable_fake_mode
@@ -394,6 +514,13 @@ def __init__(self, config: VllmConfig):
             AllGatherCutlassScaledMMPattern(
                 self.model_dtype, self.device).register(self.patterns)
 
+            # Patterns for FP8 AllGather (after FP8AllGatherOptPass)
+            # These enable AsyncTP-style fusion on the optimized FP8 path
+            AllGatherFP8ScaledMMPattern(self.model_dtype,
+                                        self.device).register(self.patterns)
+            AllGatherFP8CutlassScaledMMPattern(
+                self.model_dtype, self.device).register(self.patterns)
+
         self.dump_patterns(config, self.patterns)
 
     def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
@@ -92,12 +92,13 @@ def configure(self, config: VllmConfig):
 
         if self.pass_config.enable_sequence_parallelism:
             self.passes += [SequenceParallelismPass(config)]
+            # FP8AllGatherOptPass must run BEFORE AsyncTPPass so that
+            # AsyncTPPass can fuse vllm_all_gather_fp8 + ScaledMM
+            if self.pass_config.enable_fp8_allgather_opt:
+                self.passes += [FP8AllGatherOptPass(config)]
             if self.pass_config.enable_async_tp:
                 self.passes += [AsyncTPPass(config)]
 
-        if self.pass_config.enable_fp8_allgather_opt:
-            self.passes += [FP8AllGatherOptPass(config)]
-
         if self.pass_config.enable_fi_allreduce_fusion:
             self.passes += [AllReduceFusionPass(config)]