logger = init_logger(__name__)

+ # Maximum representable value for FP8 E4M3 format
+ FP8_E4M3_MAX = 448.0

- class AllGatherFP8Pattern:
- """Optimize AllGather + FP8 quantization by quantizing before AllGather
-
- Matches: AllGather(BF16) -> input_to_float8()
- Where input_to_float8 decomposes into:
- aminmax -> abs -> max -> clamp -> div -> mul -> clamp -> to(fp8)
+ class AllGatherFP8Pattern:
+ """Optimize AllGather + FP8 quantization by quantizing before AllGather.
+
+ This pattern transforms:
+   AllGather(BF16) → Quantize(FP8)
+ into:
+   Quantize(FP8) → AllGather(FP8)
+
+ Benefits:
+ - Reduces AllGather communication bandwidth by 2x (BF16→FP8 is 16→8 bit)
+ - Numerically equivalent when using precomputed scales
+   (modelopt quantization)
+
+ Pattern Matching:
+ - Matches: AllGather(BF16) → modelopt's input_to_float8()
+ - Where input_to_float8 decomposes into:
+   to(fp32) → reciprocal(scale) → mul → clamp(-448, 448) → to(fp8)
+ - Only matches when the scale is precomputed (not computed from the
+   gathered tensor), ensuring the transformation is valid
"""

def __init__(self, device: str, dtype: torch.dtype, tp_size: int,
@@ -47,7 +63,10 @@ def pattern(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
# This matches what's in the FX graph from modelopt quant
gathered_bf16 = torch.ops.vllm.all_gather.default(
x,
- dim=0,  # Actual dimension used in the graph
+ # Only dim=0 is supported because tensor-parallel AllGather
+ # in vLLM always gathers along the sequence dimension (dim=0)
+ # for activation tensors in transformer layers.
+ dim=0,
world_size=self.tp_size,
group_name=self.tp_group_name,
)
@@ -57,7 +76,7 @@ def pattern(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
x_f32 = gathered_bf16.to(torch.float32)
scale_inv = scale.reciprocal()
x_scaled = x_f32 * scale_inv
- x_clamped = x_scaled.clamp(min=-448.0, max=448.0)
+ x_clamped = x_scaled.clamp(min=-FP8_E4M3_MAX, max=FP8_E4M3_MAX)
gathered_fp8 = x_clamped.to(self.fp8_dtype)

return gathered_fp8
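For context on the clamp bound used in this hunk: 448.0 is the largest finite value of the FP8 E4M3 format, which is all that the new FP8_E4M3_MAX constant names. Below is a minimal, self-contained sketch of the decomposed quantization the pattern matches; the helper name quantize_fp8_e4m3 is illustrative rather than part of this change, and a precomputed per-tensor scale is assumed.

import torch

# The E4M3 limit can be queried from PyTorch; it matches FP8_E4M3_MAX above.
FP8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0

def quantize_fp8_e4m3(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # to(fp32) -> reciprocal(scale) -> mul -> clamp(-448, 448) -> to(fp8)
    x_f32 = x.to(torch.float32)
    x_scaled = x_f32 * scale.reciprocal()
    x_clamped = x_scaled.clamp(min=-FP8_E4M3_MAX, max=FP8_E4M3_MAX)
    return x_clamped.to(torch.float8_e4m3fn)

x = torch.randn(16, 128, dtype=torch.bfloat16)
scale = torch.tensor(0.05)  # precomputed scale, e.g. from modelopt calibration
x_fp8 = quantize_fp8_e4m3(x, scale)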
@@ -68,7 +87,7 @@ def replacement(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
x_f32 = x.to(torch.float32)
scale_inv = scale.reciprocal()
x_scaled = x_f32 * scale_inv
- x_clamped = x_scaled.clamp(min=-448.0, max=448.0)
+ x_clamped = x_scaled.clamp(min=-FP8_E4M3_MAX, max=FP8_E4M3_MAX)
x_fp8 = x_clamped.to(self.fp8_dtype)

# Step 2: AllGather FP8 tensors (2x less bandwidth!)
@@ -86,7 +105,24 @@ def replacement(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
class FP8AllGatherOptPass(VllmPatternMatcherPass):
- """Optimize AllGather by quantizing to FP8 first (2x bandwidth reduction)"""
+ """Optimize AllGather communication by quantizing to FP8 before gathering.
+
+ This compiler pass reduces tensor-parallel AllGather bandwidth by 2x by
+ transforming AllGather(BF16) → Quantize(FP8) into
+ Quantize(FP8) → AllGather(FP8).
+
+ The optimization is only applied when:
+ - Tensor parallelism is enabled (tp_size > 1)
+ - Model dtype is bfloat16 (required for FP8 output dtype)
+ - The pattern uses precomputed FP8 scales (e.g., from modelopt quantization)
+
+ This pass must run BEFORE AsyncTPPass so that AsyncTP can fuse the resulting
+ vllm_all_gather_fp8 ops with subsequent scaled matrix multiplications.
+
+ Configuration:
+ - Enabled via PassConfig.enable_fp8_allgather_opt
+ - Requires PassConfig.enable_sequence_parallelism to be enabled
+ """

@enable_fake_mode
def __init__(self, config: VllmConfig):
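To make the Configuration bullets in the new docstring concrete, here is a hedged usage sketch. It assumes a recent vLLM config API; the exact import path for PassConfig (top-level vs. nested CompilationConfig.PassConfig) varies between versions, and enable_fp8_allgather_opt is the flag introduced by this change.

from vllm.config import CompilationConfig, PassConfig  # nested in CompilationConfig in older versions

# Sketch: enable the FP8 AllGather pass together with sequence parallelism,
# which it requires; the pass itself only activates when tp_size > 1 and the
# model dtype is bfloat16.
compilation_config = CompilationConfig(
    pass_config=PassConfig(
        enable_sequence_parallelism=True,
        enable_fp8_allgather_opt=True,  # flag added by this change
    ),
)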
@@ -135,9 +171,7 @@ def __call__(self, graph: fx.Graph):
if self.matched_count > 0:
logger.info(
"FP8 AllGather optimization: replaced %d AllGather "
- "operation(s) with FP8 quantized versions",
- self.matched_count)
+ "operation(s) with FP8 quantized versions", self.matched_count)
else:
- logger.debug(
- "FP8 AllGather optimization: "
- "no matching patterns found in graph")
+ logger.debug("FP8 AllGather optimization: "
+ "no matching patterns found in graph")
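Finally, a small self-contained sketch of the numerical-equivalence claim from the docstrings: with a precomputed scale, quantizing each shard before gathering yields exactly the same FP8 values as gathering first and quantizing afterward. torch.cat along dim 0 stands in for the AllGather here, and all names are illustrative.

import torch

FP8_E4M3_MAX = 448.0  # torch.finfo(torch.float8_e4m3fn).max

def quant(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Same elementwise decomposition as the matched pattern.
    x_scaled = x.to(torch.float32) * scale.reciprocal()
    return x_scaled.clamp(min=-FP8_E4M3_MAX, max=FP8_E4M3_MAX).to(torch.float8_e4m3fn)

# Per-rank shards that an AllGather would concatenate along dim 0.
shards = [torch.randn(4, 8, dtype=torch.bfloat16) for _ in range(2)]
scale = torch.tensor(0.1)  # precomputed and identical on every rank

quant_after_gather = quant(torch.cat(shards, dim=0), scale)
gather_after_quant = torch.cat([quant(s, scale) for s in shards], dim=0)

assert torch.equal(quant_after_gather.to(torch.float32),
                   gather_after_quant.to(torch.float32))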