
Commit 368ceb4

format fix

Signed-off-by: Yongji Wu <[email protected]>
1 parent 194bad1 commit 368ceb4

19 files changed (+668 -326 lines)

vllm/config/parallel.py

Lines changed: 17 additions & 5 deletions
@@ -298,7 +298,7 @@ def stateless_init_dp_group(self, return_store: bool = False) -> ProcessGroup:
                     self.data_parallel_rank,
                     self.data_parallel_size,
                     backend="gloo",
-                    return_store=return_store
+                    return_store=return_store,
                 )
             except DistNetworkError as e:
                 # We only want to retry when the root cause is EADDRINUSE.
@@ -419,19 +419,31 @@ def __post_init__(self) -> None:
         if self.enable_elastic_ep:
             num_world_groups = 1
             num_dp_groups = max(1, self.world_size_across_dp // self.data_parallel_size)
-            num_ep_groups = max(1, self.world_size_across_dp // (self.data_parallel_size * self.tensor_parallel_size))
+            num_ep_groups = max(
+                1,
+                self.world_size_across_dp
+                // (self.data_parallel_size * self.tensor_parallel_size),
+            )
 
             total_ports_needed = (num_world_groups + num_dp_groups + num_ep_groups) * 3
 
             if not self._stateless_world_group_port_list:
                 all_ports = get_open_ports_list(total_ports_needed + 5)
                 self._data_parallel_master_port_list = all_ports[-5:]
                 all_ports = all_ports[:-5]
-                self._stateless_world_group_port_list = [all_ports[i:i+3] for i in range(0, num_world_groups * 3, 3)]
+                self._stateless_world_group_port_list = [
+                    all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)
+                ]
                 start_idx = num_world_groups * 3
-                self._stateless_dp_group_port_list = [all_ports[i:i+3] for i in range(start_idx, start_idx + num_dp_groups * 3, 3)]
+                self._stateless_dp_group_port_list = [
+                    all_ports[i : i + 3]
+                    for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
+                ]
                 start_idx += num_dp_groups * 3
-                self._stateless_ep_group_port_list = [all_ports[i:i+3] for i in range(start_idx, start_idx + num_ep_groups * 3, 3)]
+                self._stateless_ep_group_port_list = [
+                    all_ports[i : i + 3]
+                    for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
+                ]
 
         if self.data_parallel_size_local > self.data_parallel_size:
             raise ValueError(
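Note on the hunk above: the reshaped comprehensions change layout only; the port bookkeeping still hands each stateless group (world, DP, EP) three consecutive ports and reserves the last five ports for the data-parallel master. A standalone sketch of that partitioning, using a hypothetical pre-generated port list in place of get_open_ports_list (illustrative values, not vLLM code):

# Hypothetical group counts; in vLLM these are derived from
# world_size_across_dp, data_parallel_size and tensor_parallel_size.
num_world_groups, num_dp_groups, num_ep_groups = 1, 2, 4
total_ports_needed = (num_world_groups + num_dp_groups + num_ep_groups) * 3

# Stand-in for get_open_ports_list(total_ports_needed + 5).
all_ports = list(range(20000, 20000 + total_ports_needed + 5))

data_parallel_master_port_list = all_ports[-5:]  # last 5 go to the DP master
all_ports = all_ports[:-5]

world_group_ports = [all_ports[i : i + 3] for i in range(0, num_world_groups * 3, 3)]
start_idx = num_world_groups * 3
dp_group_ports = [
    all_ports[i : i + 3] for i in range(start_idx, start_idx + num_dp_groups * 3, 3)
]
start_idx += num_dp_groups * 3
ep_group_ports = [
    all_ports[i : i + 3] for i in range(start_idx, start_idx + num_ep_groups * 3, 3)
]

assert len(world_group_ports) == num_world_groups
assert len(dp_group_ports) == num_dp_groups
assert len(ep_group_ports) == num_ep_groups
assert all(len(g) == 3 for g in world_group_ports + dp_group_ports + ep_group_ports)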

vllm/distributed/device_communicators/all2all.py

Lines changed: 3 additions & 3 deletions
@@ -185,7 +185,7 @@ def get_handle(self, kwargs):
             logger.debug("PPLX NVSHMEM UID = %s", uid)
             nvshmem_init(uid, self.rank, self.world_size)
             self.nvshmem_initialized = True
-
+
         import pplx_kernels as pplx
 
         return self.handle_cache.get_or_create(
@@ -381,11 +381,11 @@ class FlashInferAllToAllManager(All2AllManagerBase):
     All2All communication based on flashinfer kernels.
     """
 
-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_flashinfer_all2all(), (
             "flashinfer all2all module not found. Please install/check flashinfer"
         )  # noqa
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
         logger.debug(
             "Initialize for flashinfer All2All rank=%d, world size=%d",
             self.rank,
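Note on the second hunk: the only signature change is that FlashInferAllToAllManager now accepts an optional tcp_store_group and forwards it to All2AllManagerBase, matching the other managers. A minimal sketch of that pattern with simplified stand-in classes (assumed names, not the real vLLM classes):

class ManagerBase:
    # Stand-in for All2AllManagerBase.
    def __init__(self, cpu_group, tcp_store_group=None):
        # When a TCP-store backed (stateless) group is supplied, the base
        # class can use it instead of cpu_group for locality checks.
        self.cpu_group = cpu_group
        self.tcp_store_group = tcp_store_group


class FlashInferLikeManager(ManagerBase):
    # Stand-in for FlashInferAllToAllManager.
    def __init__(self, cpu_group, tcp_store_group=None):
        # The default of None keeps existing call sites working unchanged.
        super().__init__(cpu_group, tcp_store_group)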

vllm/distributed/device_communicators/base_device_communicator.py

Lines changed: 17 additions & 2 deletions
@@ -57,7 +57,9 @@ def __init__(self, cpu_group, tcp_store_group=None):
         if tcp_store_group is None:
             self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
         else:
-            self.internode = not all(in_the_same_node_as(tcp_store_group, source_rank=0))
+            self.internode = not all(
+                in_the_same_node_as(tcp_store_group, source_rank=0)
+            )
 
     def get_handle(self, kwargs):
         # get a handle for the all2all communication,
@@ -104,7 +106,7 @@ def __init__(
         device_group: Optional[ProcessGroup] = None,
         unique_name: str = "",
         global_ranks: Optional[list[int]] = None,
-        global_world_size: Optional[int] = None
+        global_world_size: Optional[int] = None,
     ):
         self.device = device or torch.device("cpu")
         self.cpu_group = cpu_group
@@ -113,12 +115,15 @@ def __init__(
 
         # Check if this is a stateless process group
         from torch.distributed.distributed_c10d import _world
+
         is_stateless = _world.pg_map.get(cpu_group, None) is None
 
         if is_stateless:
             # For stateless groups, we can't use torch.distributed methods
             self.rank = cpu_group.rank()
             self.world_size = cpu_group.size()
+            assert global_ranks is not None
+            assert global_world_size is not None
             self.ranks = global_ranks
             self.global_rank = self.ranks[self.rank]
             self.global_world_size = global_world_size
@@ -270,6 +275,13 @@ def recv(
         torch.distributed.recv(tensor, self.ranks[src], self.device_group)
         return tensor
 
+    def broadcast(self, tensor: torch.Tensor, src: int = 0) -> torch.Tensor:
+        """Broadcast a tensor from source rank to all ranks."""
+        if self.world_size == 1:
+            return tensor
+        torch.distributed.broadcast(tensor, self.ranks[src], self.device_group)
+        return tensor
+
     def destroy(self):
         pass
 
@@ -313,3 +325,6 @@ def combine(
         This is a no-op in the base class.
         """
         return hidden_states
+
+    def batch_isend_irecv(self, p2p_ops: list):
+        raise NotImplementedError
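Note on the added broadcast helper: it mirrors the existing send/recv wrappers by translating the group-local src rank through self.ranks into a global rank before calling torch.distributed.broadcast, and it short-circuits for single-rank groups. A hypothetical usage sketch, assuming the process group is already initialized and comm is a DeviceCommunicatorBase-style communicator:

import torch


def broadcast_example(comm) -> torch.Tensor:
    # Every rank allocates the buffer; rank 0 fills it with real data.
    tensor = torch.zeros(4)
    if comm.rank == 0:
        tensor = torch.arange(4, dtype=torch.float32)
    # src=0 is the group-local rank; broadcast() maps it to the global rank
    # via comm.ranks before handing it to torch.distributed.broadcast.
    return comm.broadcast(tensor, src=0)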

vllm/distributed/device_communicators/cuda_communicator.py

Lines changed: 28 additions & 9 deletions
@@ -17,8 +17,8 @@
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
-from .base_device_communicator import DeviceCommunicatorBase
 from ..utils import StatelessProcessGroup
+from .base_device_communicator import DeviceCommunicatorBase
 
 logger = init_logger(__name__)
 
@@ -32,9 +32,16 @@ def __init__(
         unique_name: str = "",
         global_ranks: Optional[list[int]] = None,
         global_world_size: Optional[int] = None,
-        tcp_store_group: Optional[StatelessProcessGroup] = None
+        tcp_store_group: Optional[StatelessProcessGroup] = None,
     ):
-        super().__init__(cpu_group, device, device_group, unique_name, global_ranks, global_world_size)
+        super().__init__(
+            cpu_group,
+            device,
+            device_group,
+            unique_name,
+            global_ranks,
+            global_world_size,
+        )
         if "tp" not in unique_name:
             # custom allreduce or torch symm mem can be used only by tp
             use_custom_allreduce = False
@@ -99,32 +106,44 @@ def __init__(
             if all2all_backend == "naive":
                 from .all2all import NaiveAll2AllManager
 
-                self.all2all_manager = NaiveAll2AllManager(self.cpu_group, tcp_store_group=tcp_store_group)
+                self.all2all_manager = NaiveAll2AllManager(
+                    self.cpu_group, tcp_store_group=tcp_store_group
+                )
                 logger.info("Using naive all2all manager.")
             elif all2all_backend == "allgather_reducescatter":
                 from .all2all import AgRsAll2AllManager
 
-                self.all2all_manager = AgRsAll2AllManager(self.cpu_group, tcp_store_group=tcp_store_group)
+                self.all2all_manager = AgRsAll2AllManager(
+                    self.cpu_group, tcp_store_group=tcp_store_group
+                )
                 logger.info("Using AllGather-ReduceScatter all2all manager.")
             elif all2all_backend == "pplx":
                 from .all2all import PPLXAll2AllManager
 
-                self.all2all_manager = PPLXAll2AllManager(self.cpu_group, tcp_store_group=tcp_store_group)
+                self.all2all_manager = PPLXAll2AllManager(
+                    self.cpu_group, tcp_store_group=tcp_store_group
+                )
                 logger.info("Using PPLX all2all manager.")
             elif all2all_backend == "deepep_high_throughput":
                 from .all2all import DeepEPHTAll2AllManager
 
-                self.all2all_manager = DeepEPHTAll2AllManager(self.cpu_group, tcp_store_group=tcp_store_group)
+                self.all2all_manager = DeepEPHTAll2AllManager(
+                    self.cpu_group, tcp_store_group=tcp_store_group
+                )
                 logger.info("Using DeepEP High-Throughput all2all manager.")
             elif all2all_backend == "deepep_low_latency":
                 from .all2all import DeepEPLLAll2AllManager
 
-                self.all2all_manager = DeepEPLLAll2AllManager(self.cpu_group, tcp_store_group=tcp_store_group)
+                self.all2all_manager = DeepEPLLAll2AllManager(
+                    self.cpu_group, tcp_store_group=tcp_store_group
+                )
                 logger.info("Using DeepEP Low-Latency all2all manager.")
            elif all2all_backend == "flashinfer_all2allv":
                 from .all2all import FlashInferAllToAllManager
 
-                self.all2all_manager = FlashInferAllToAllManager(self.cpu_group, tcp_store_group=tcp_store_group)
+                self.all2all_manager = FlashInferAllToAllManager(
+                    self.cpu_group, tcp_store_group=tcp_store_group
+                )
                 logger.info("Using Flashinfer all2allv manager.")
             else:
                 raise ValueError(f"Unknown all2all backend: {all2all_backend}")

vllm/distributed/device_communicators/pynccl.py

Lines changed: 13 additions & 3 deletions
@@ -6,7 +6,7 @@
 # ===================== import region =====================
 import torch
 import torch.distributed as dist
-from torch.distributed import ProcessGroup, ReduceOp, P2POp
+from torch.distributed import ProcessGroup, ReduceOp
 
 import vllm.envs as envs
 from vllm.distributed.device_communicators.pynccl_wrapper import (
@@ -312,7 +312,12 @@ def send(self, tensor: torch.Tensor, dst: int, stream=None):
         )
         if stream is None:
             stream = current_stream()
-        if tensor.dtype in [torch.float8_e5m2, torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2fnuz]:
+        if tensor.dtype in [
+            torch.float8_e5m2,
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+            torch.float8_e5m2fnuz,
+        ]:
             nccl_dtype = ncclDataTypeEnum.from_torch(torch.uint8)
         else:
             nccl_dtype = ncclDataTypeEnum.from_torch(tensor.dtype)
@@ -334,7 +339,12 @@ def recv(self, tensor: torch.Tensor, src: int, stream=None):
         )
         if stream is None:
             stream = current_stream()
-        if tensor.dtype in [torch.float8_e5m2, torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2fnuz]:
+        if tensor.dtype in [
+            torch.float8_e5m2,
+            torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
+            torch.float8_e5m2fnuz,
+        ]:
             nccl_dtype = ncclDataTypeEnum.from_torch(torch.uint8)
         else:
             nccl_dtype = ncclDataTypeEnum.from_torch(tensor.dtype)
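Note on the two dtype hunks: both wrap the same check, reporting FP8 tensors to NCCL as uint8. That is safe because every FP8 variant is one byte per element, so the element count and buffer size are unchanged; presumably the NCCL dtype enum used here has no FP8 entries. A small self-contained sketch of the mapping (hypothetical helper name, pure PyTorch):

import torch

# FP8 variants that the send/recv paths reinterpret as uint8.
_FP8_DTYPES = (
    torch.float8_e5m2,
    torch.float8_e4m3fn,
    torch.float8_e4m3fnuz,
    torch.float8_e5m2fnuz,
)


def nccl_wire_dtype(dtype: torch.dtype) -> torch.dtype:
    # Dtype actually reported to NCCL for a tensor of `dtype`.
    return torch.uint8 if dtype in _FP8_DTYPES else dtype


assert nccl_wire_dtype(torch.float8_e4m3fn) == torch.uint8
assert nccl_wire_dtype(torch.float16) == torch.float16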
