vllm-project
diff --git a/‎tests/compile/test_fp8_allgather.py‎
Lines changed: 217 additions & 0 deletions b/‎tests/compile/test_fp8_allgather.py‎
Lines changed: 217 additions & 0 deletions
diff --git a/‎vllm/compilation/fp8_allgather_pass.py‎
Lines changed: 143 additions & 0 deletions b/‎vllm/compilation/fp8_allgather_pass.py‎
Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,217 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+
+from ..utils import multi_gpu_test
+
+if not current_platform.is_cuda():
+    pytest.skip("CUDA only test", allow_module_level=True)
+
+
+def test_nccl_fp8_dtype_support():
+    """Test that NCCL wrapper supports FP8 datatypes"""
+    from vllm.distributed.device_communicators.pynccl_wrapper import (
+        ncclDataTypeEnum)
+
+    # Test FP8 E4M3
+    assert hasattr(ncclDataTypeEnum, 'ncclFp8E4M3')
+    assert ncclDataTypeEnum.ncclFp8E4M3 == 10
+
+    # Test FP8 E5M2
+    assert hasattr(ncclDataTypeEnum, 'ncclFp8E5M2')
+    assert ncclDataTypeEnum.ncclFp8E5M2 == 11
+
+    # Test from_torch mapping
+    assert ncclDataTypeEnum.from_torch(
+        torch.float8_e4m3fn) == ncclDataTypeEnum.ncclFp8E4M3
+    assert ncclDataTypeEnum.from_torch(
+        torch.float8_e5m2) == ncclDataTypeEnum.ncclFp8E5M2
+
+
+def test_custom_ops_registered():
+    """Test that custom FP8 ops are registered"""
+    # Import to trigger registration
+
+    # Check that ops are registered
+    assert hasattr(torch.ops.vllm, 'vllm_quantize_fp8')
+    assert hasattr(torch.ops.vllm, 'vllm_all_gather_fp8')
+
+    # Check that default variants exist
+    assert hasattr(torch.ops.vllm.vllm_quantize_fp8, 'default')
+    assert hasattr(torch.ops.vllm.vllm_all_gather_fp8, 'default')
+
+
+def test_fp8_quantization_op():
+    """Test FP8 quantization custom op"""
+    from vllm.compilation.fp8_collective_ops import vllm_quantize_fp8
+
+    # Create test tensor
+    x = torch.randn(16, 32, dtype=torch.bfloat16, device='cuda')
+
+    # Quantize
+    x_fp8, scale_inv = vllm_quantize_fp8(x)
+
+    # Check output types
+    assert x_fp8.dtype == torch.float8_e4m3fn
+    assert scale_inv.dtype == torch.float32
+
+    # Check shapes
+    assert x_fp8.shape == x.shape
+    assert scale_inv.numel() == 1  # per-tensor scale
+
+    # Check dequantization (approximately recovers original)
+    x_dequant = x_fp8.to(torch.bfloat16) * scale_inv
+    torch.testing.assert_close(x_dequant, x, rtol=0.1, atol=0.1)
+
+
+def fp8_allgather_worker(local_rank: int, world_size: int):
+    """Worker function for multi-GPU FP8 AllGather test"""
+    from vllm.compilation.fp8_collective_ops import vllm_all_gather_fp8
+    from vllm.distributed import (get_tp_group, init_distributed_environment,
+                                  initialize_model_parallel)
+    from vllm.utils import update_environment_variables
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+
+    update_environment_variables({
+        'RANK': str(local_rank),
+        'LOCAL_RANK': str(local_rank),
+        'WORLD_SIZE': str(world_size),
+        'MASTER_ADDR': 'localhost',
+        'MASTER_PORT': '29501',
+    })
+
+    # Initialize distributed
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    # Create test tensor (generate as BF16 then convert to FP8)
+    x = torch.randn(8, 16, dtype=torch.bfloat16,
+                    device='cuda').to(torch.float8_e4m3fn)
+
+    # All-gather
+    tp_group = get_tp_group()
+    gathered = vllm_all_gather_fp8(x,
+                                   dim=0,
+                                   world_size=tp_group.world_size,
+                                   group_name=tp_group.unique_name)
+
+    # Check shape
+    expected_shape = (8 * tp_group.world_size, 16)
+    assert gathered.shape == expected_shape
+    print(
+        f"Rank {local_rank}: ✅ FP8 AllGather op test passed! Shape: {gathered.shape}"
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+def test_fp8_allgather_op():
+    """Test FP8 all-gather custom op (requires multi-GPU)"""
+
+    def run_torch_spawn(fn, nprocs):
+        torch.multiprocessing.spawn(fn, args=(nprocs, ), nprocs=nprocs)
+
+    run_torch_spawn(fp8_allgather_worker, 2)
+
+
+def test_fp8_allgather_pass_init():
+    """Test FP8 AllGather pass initialization"""
+    pytest.skip(
+        "Requires distributed initialization - test manually with multi-GPU")
+
+
+def test_fp8_allgather_pattern_fake():
+    """Test pattern with fake mode (no actual distributed execution)"""
+    pytest.skip(
+        "Pattern registration requires valid TP group - test manually with multi-GPU"
+    )
+
+
+def fp8_allgather_correctness_worker(local_rank: int, world_size: int):
+    """Worker function for FP8 AllGather numerical correctness test"""
+    from vllm.compilation.fp8_collective_ops import (vllm_all_gather_fp8,
+                                                     vllm_quantize_fp8)
+    from vllm.distributed import (get_tp_group, init_distributed_environment,
+                                  initialize_model_parallel,
+                                  tensor_model_parallel_all_gather)
+    from vllm.utils import update_environment_variables
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+
+    update_environment_variables({
+        'RANK': str(local_rank),
+        'LOCAL_RANK': str(local_rank),
+        'WORLD_SIZE': str(world_size),
+        'MASTER_ADDR': 'localhost',
+        'MASTER_PORT': '29502',
+    })
+
+    # Initialize distributed
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    # Create test tensor
+    x = torch.randn(16, 32, dtype=torch.bfloat16, device='cuda')
+
+    # Method 1: Direct AllGather (baseline, default dim=-1)
+    gathered_direct = tensor_model_parallel_all_gather(x)
+
+    # Method 2: FP8 Optimized AllGather (use same dim=-1)
+    x_fp8, scale_inv = vllm_quantize_fp8(x)
+    tp_group = get_tp_group()
+    gathered_fp8 = vllm_all_gather_fp8(x_fp8,
+                                       dim=-1,
+                                       world_size=tp_group.world_size,
+                                       group_name=tp_group.unique_name)
+
+    # All-gather scales (reshape scalar to 1D first)
+    scale_inv_1d = scale_inv.view(1)
+    scale_gathered = tensor_model_parallel_all_gather(scale_inv_1d, dim=0)
+
+    # Dequantize: apply each rank's scale to its chunk
+    # gathered_fp8 has shape [16, 32*world_size], scale_gathered has shape [world_size]
+    # Need to broadcast scale to match each chunk along dim=-1
+    chunk_size = x.shape[-1]
+    scale_expanded = torch.repeat_interleave(scale_gathered, chunk_size).view(
+        1, -1).to(torch.bfloat16)
+    gathered_opt = gathered_fp8.to(torch.bfloat16) * scale_expanded
+
+    # Check correctness (allow for FP8 quantization error)
+    torch.testing.assert_close(gathered_opt,
+                               gathered_direct,
+                               rtol=0.05,
+                               atol=0.05)
+    print(
+        f"Rank {local_rank}: ✅ FP8 AllGather numerical correctness test passed!"
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+def test_fp8_allgather_numerical_correctness():
+    """Test end-to-end numerical correctness of FP8 AllGather optimization"""
+
+    def run_torch_spawn(fn, nprocs):
+        torch.multiprocessing.spawn(fn, args=(nprocs, ), nprocs=nprocs)
+
+    run_torch_spawn(fp8_allgather_correctness_worker, 2)
+
+
+def test_pass_config_has_flag():
+    """Test that PassConfig has enable_fp8_allgather_opt flag"""
+    from vllm.config import PassConfig
+
+    config = PassConfig(enable_fp8_allgather_opt=True)
+    assert config.enable_fp8_allgather_opt is True
+
+    config = PassConfig(enable_fp8_allgather_opt=False)
+    assert config.enable_fp8_allgather_opt is False
+
+    # Default should be False
+    config = PassConfig()
+    assert config.enable_fp8_allgather_opt is False
@@ -0,0 +1,143 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+import torch._inductor.pattern_matcher as pm
+import torch.fx as fx
+from torch._inductor.pattern_matcher import PatternMatcherPass
+
+from vllm.config import VllmConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.logger import init_logger
+
+from .fp8_collective_ops import vllm_all_gather_fp8
+from .inductor_pass import enable_fake_mode
+from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass
+
+logger = init_logger(__name__)
+
+
+class AllGatherFP8Pattern:
+    """Optimize AllGather + FP8 quantization by quantizing before AllGather
+
+    Matches: AllGather(BF16) -> input_to_float8()
+    Where input_to_float8 decomposes into:
+        aminmax -> abs -> max -> clamp -> div -> mul -> clamp -> to(fp8)
+    """
+
+    def __init__(self, device: str, dtype: torch.dtype, tp_size: int,
+                 tp_group_name: str):
+        self.device = device
+        self.dtype = dtype
+        self.tp_size = tp_size
+        self.tp_group_name = tp_group_name
+        self.fp8_dtype = torch.float8_e4m3fn
+
+    def get_inputs(self):
+        # BF16 tensor that will be all-gathered, then quantized to FP8
+        x = torch.empty([8, 16], device=self.device, dtype=self.dtype)
+        # Precomputed FP8 scale (scalar)
+        scale = torch.empty([], device=self.device, dtype=torch.float32)
+        return [x, scale]
+
+    def register(self, pm_pass: PatternMatcherPass):
+
+        def pattern(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+            # Match: AllGather(BF16) -> modelopt FP8 quantization
+            # This matches what's in the FX graph from modelopt quant
+            gathered_bf16 = torch.ops.vllm.all_gather.default(
+                x,
+                dim=0,  # Actual dimension used in the graph
+                world_size=self.tp_size,
+                group_name=self.tp_group_name,
+            )
+
+            # Modelopt quantization pattern (uses precomputed scale):
+            # convert to fp32 -> multiply by 1/scale -> clamp -> convert to fp8
+            x_f32 = gathered_bf16.to(torch.float32)
+            scale_inv = scale.reciprocal()
+            x_scaled = x_f32 * scale_inv
+            x_clamped = x_scaled.clamp(min=-448.0, max=448.0)
+            gathered_fp8 = x_clamped.to(self.fp8_dtype)
+
+            return gathered_fp8
+
+        def replacement(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
+            # Step 1: Quantize to FP8 locally BEFORE AllGather
+            # Use the same modelopt quantization logic
+            x_f32 = x.to(torch.float32)
+            scale_inv = scale.reciprocal()
+            x_scaled = x_f32 * scale_inv
+            x_clamped = x_scaled.clamp(min=-448.0, max=448.0)
+            x_fp8 = x_clamped.to(self.fp8_dtype)
+
+            # Step 2: AllGather FP8 tensors (2x less bandwidth!)
+            gathered_fp8 = vllm_all_gather_fp8(
+                x_fp8,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp_group_name,
+            )
+
+            return gathered_fp8
+
+        pm.register_replacement(pattern, replacement, self.get_inputs(),
+                                pm.fwd_only, pm_pass)
+
+
+class FP8AllGatherOptPass(VllmPatternMatcherPass):
+    """Optimize AllGather by quantizing to FP8 first (2x bandwidth reduction)"""
+
+    @enable_fake_mode
+    def __init__(self, config: VllmConfig):
+        super().__init__(config)
+
+        self.disabled = False  # Initialize disabled flag
+        self.tp_size = get_tensor_model_parallel_world_size()
+        if self.tp_size <= 1:
+            self.disabled = True
+            logger.info(
+                "FP8 AllGather optimization disabled: TP size = %d "
+                "(no communication needed)", self.tp_size)
+            return
+
+        from vllm.distributed import get_tp_group
+        self.tp_group_name = get_tp_group().unique_name
+
+        self.patterns = PatternMatcherPass(pass_name="fp8_allgather_opt_pass")
+
+        # Only apply to BF16 models (FP8 requires BF16 output dtype)
+        if self.model_dtype == torch.bfloat16:
+            AllGatherFP8Pattern(
+                self.device,
+                self.model_dtype,
+                self.tp_size,
+                self.tp_group_name,
+            ).register(self.patterns)
+            logger.info(
+                "FP8 AllGather optimization enabled: "
+                "TP size = %d, dtype = %s", self.tp_size, self.model_dtype)
+        else:
+            self.disabled = True
+            logger.info(
+                "FP8 AllGather optimization disabled: "
+                "model dtype = %s (requires BF16)", self.model_dtype)
+
+        if not self.disabled:
+            self.dump_patterns(config, self.patterns)
+
+    @VllmInductorPass.time_and_log
+    def __call__(self, graph: fx.Graph):
+        if getattr(self, 'disabled', False):
+            return
+
+        self.matched_count = self.patterns.apply(graph)
+        if self.matched_count > 0:
+            logger.info(
+                "FP8 AllGather optimization: replaced %d AllGather "
+                "operation(s) with FP8 quantized versions",
+                self.matched_count)
+        else:
+            logger.debug(
+                "FP8 AllGather optimization: "
+                "no matching patterns found in graph")