
Commit 194bad1

eep phase2 init

- stateless group elastic EP: support CUDA graph + peer weights transfer
- update state filter
- small fix bench script
- small fix
- fix intra-node to inter-node scaling
- remove unused code

1 parent 185d8ed, commit 194bad1

27 files changed: +2043 -549 lines

experimental/bench.sh

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+# MODEL_NAME="deepseek-ai/DeepSeek-V3.1"
+MODEL_NAME="Qwen/Qwen3-30B-A3B-Thinking-2507-FP8"
+# MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507-FP8"
+HOST="localhost"
+PORT=8006
+
+vllm bench serve \
+    --model $MODEL_NAME \
+    --host $HOST \
+    --port $PORT \
+    --dataset-name random \
+    --random-input-len 128 \
+    --random-output-len 128 \
+    --num-prompts 512

experimental/scale.sh

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+#!/bin/bash
+HOST="localhost"
+PORT=8006
+
+python examples/online_serving/elastic_ep/scale.py --host $HOST --port $PORT --new-dp-size 4
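The referenced scale.py is not part of this commit. As a rough sketch only of the client-side surface implied by the invocation above (--host, --port, --new-dp-size), assuming a hypothetical /scale_elastic_ep endpoint and JSON payload; the real example script may use a different route and fields:

```python
# Illustrative sketch only: the endpoint name and payload are assumptions, not
# taken from this commit. examples/online_serving/elastic_ep/scale.py may differ.
import argparse

import requests


def main() -> None:
    parser = argparse.ArgumentParser(description="Ask a running server for a new DP size")
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", type=int, default=8006)
    parser.add_argument("--new-dp-size", type=int, required=True)
    args = parser.parse_args()

    # Hypothetical route; check the real example script for the actual API.
    url = f"http://{args.host}:{args.port}/scale_elastic_ep"
    resp = requests.post(url, json={"new_data_parallel_size": args.new_dp_size})
    print(resp.status_code, resp.text)


if __name__ == "__main__":
    main()
```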

experimental/serve.sh

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# MODEL_NAME="deepseek-ai/DeepSeek-V3.1"
+MODEL_NAME="Qwen/Qwen3-30B-A3B-Thinking-2507-FP8"
+# MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507-FP8"
+HOST="0.0.0.0"
+PORT=8006
+
+DATA_PARALLEL_SIZE=2
+DATA_PARALLEL_SIZE_LOCAL=2
+LEADER_ADDRESS="192.168.5.45"
+# LEADER_ADDRESS="172.18.0.3"
+
+NUM_REDUNDANT_EXPERTS=16
+EPLB_WINDOW_SIZE=1000
+EPLB_STEP_INTERVAL=3000
+MAX_MODEL_LEN=16384
+GPU_MEMORY_UTILIZATION=0.9
+
+export DG_JIT_NVCC_COMPILER=/usr/local/cuda-12.8/bin/nvcc
+export CUDA_HOME='/usr/local/cuda-12.8'
+
+export VLLM_USE_V1=1
+export VLLM_ALL2ALL_BACKEND="pplx"
+# export VLLM_ALL2ALL_BACKEND="deepep_low_latency"
+export VLLM_USE_DEEP_GEMM=1
+# export VLLM_ATTENTION_BACKEND="TRITON_MLA"
+
+# Launch the vLLM server
+vllm serve $MODEL_NAME --trust-remote-code \
+    --disable-log-requests \
+    --host $HOST \
+    --port $PORT \
+    --tensor-parallel-size 1 \
+    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
+    --max-model-len $MAX_MODEL_LEN \
+    --no-enable-prefix-caching \
+    --enable-expert-parallel \
+    --enable-elastic-ep \
+    --enable-eplb \
+    --eplb-config.num_redundant_experts $NUM_REDUNDANT_EXPERTS \
+    --eplb-config.window_size $EPLB_WINDOW_SIZE \
+    --eplb-config.step_interval $EPLB_STEP_INTERVAL \
+    --data-parallel-backend ray \
+    --data-parallel-size $DATA_PARALLEL_SIZE \
+    --data-parallel-size-local $DATA_PARALLEL_SIZE_LOCAL \
+    --data-parallel-address $LEADER_ADDRESS \
+    --data-parallel-rpc-port 9876 \
+    --data-parallel-start-rank 0
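One detail worth noting in the EPLB settings: with expert parallelism, each EP rank holds (logical experts + redundant experts) / EP size physical experts, so the redundant-expert count is usually chosen so that this division stays exact at every scale the deployment will pass through. A small sketch of the arithmetic, assuming 128 routed experts for Qwen3-30B-A3B (a figure not stated in this commit):

```python
# Hedged sketch: why NUM_REDUNDANT_EXPERTS=16 keeps the per-rank expert count
# integral for both the launch size (DP=2 in serve.sh) and the scaled size
# (DP=4 in scale.sh). 128 routed experts for Qwen3-30B-A3B is an assumption.
NUM_LOGICAL_EXPERTS = 128    # assumed for Qwen/Qwen3-30B-A3B
NUM_REDUNDANT_EXPERTS = 16   # from serve.sh

num_physical = NUM_LOGICAL_EXPERTS + NUM_REDUNDANT_EXPERTS

for ep_size in (2, 4):       # EP size before and after scaling (TP=1, so EP == DP)
    assert num_physical % ep_size == 0, "pick redundant experts so this divides evenly"
    print(f"EP size {ep_size}: {num_physical // ep_size} physical experts per rank")
```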

vllm/config/parallel.py

Lines changed: 47 additions & 1 deletion
@@ -138,6 +138,9 @@ class ParallelConfig:
     disable_custom_all_reduce: bool = False
     """Disable the custom all-reduce kernel and fall back to NCCL."""

+    enable_elastic_ep: bool = False
+    """Enable elastic expert parallelism with stateless NCCL groups for DP/EP."""
+
     enable_dbo: bool = False
     """Enable dual batch overlap for the model executor."""

@@ -199,6 +202,21 @@ class is dynamically inherited by the worker class. This is used to inject
     Set to be private as it's not intended to be configured by users.
     """

+    _stateless_world_group_port_list: list[int] = field(default_factory=list)
+    """List of open ports for stateless world group when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    """
+
+    _stateless_dp_group_port_list: list[int] = field(default_factory=list)
+    """List of open ports for stateless DP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    """
+
+    _stateless_ep_group_port_list: list[int] = field(default_factory=list)
+    """List of open ports for stateless EP groups when enable_elastic_ep is True.
+    Set to be private as it's not intended to be configured by users.
+    """
+
     decode_context_parallel_size: int = 1
     """Number of decode context parallel groups, because the world size does
     not change by dcp, it simply reuse the GPUs of TP group, and tp_size
@@ -246,7 +264,16 @@ def get_next_dp_init_port(self) -> int:

         return answer

-    def stateless_init_dp_group(self) -> ProcessGroup:
+    def get_next_stateless_world_group_port(self) -> list[int]:
+        return self._stateless_world_group_port_list.pop(0)
+
+    def get_next_stateless_dp_group_port(self) -> list[int]:
+        return self._stateless_dp_group_port_list.pop(0)
+
+    def get_next_stateless_ep_group_port(self) -> list[int]:
+        return self._stateless_ep_group_port_list.pop(0)
+
+    def stateless_init_dp_group(self, return_store: bool = False) -> ProcessGroup:
         # NOTE: In high-concurrency scenarios multiple processes
         # can pick the same (currently free) port through a race
         # condition when calling `get_open_port()`. When the first
@@ -271,6 +298,7 @@ def stateless_init_dp_group(self) -> ProcessGroup:
                 self.data_parallel_rank,
                 self.data_parallel_size,
                 backend="gloo",
+                return_store=return_store
             )
         except DistNetworkError as e:
             # We only want to retry when the root cause is EADDRINUSE.
@@ -387,6 +415,24 @@ def __post_init__(self) -> None:
             logger.info("Using external launcher for distributed inference.")
             self.world_size *= self.data_parallel_size

+        # Initialize stateless group ports for elastic EP
+        if self.enable_elastic_ep:
+            num_world_groups = 1
+            num_dp_groups = max(1, self.world_size_across_dp // self.data_parallel_size)
+            num_ep_groups = max(1, self.world_size_across_dp // (self.data_parallel_size * self.tensor_parallel_size))
+
+            total_ports_needed = (num_world_groups + num_dp_groups + num_ep_groups) * 3
+
+            if not self._stateless_world_group_port_list:
+                all_ports = get_open_ports_list(total_ports_needed + 5)
+                self._data_parallel_master_port_list = all_ports[-5:]
+                all_ports = all_ports[:-5]
+                self._stateless_world_group_port_list = [all_ports[i:i+3] for i in range(0, num_world_groups * 3, 3)]
+                start_idx = num_world_groups * 3
+                self._stateless_dp_group_port_list = [all_ports[i:i+3] for i in range(start_idx, start_idx + num_dp_groups * 3, 3)]
+                start_idx += num_dp_groups * 3
+                self._stateless_ep_group_port_list = [all_ports[i:i+3] for i in range(start_idx, start_idx + num_ep_groups * 3, 3)]
+
         if self.data_parallel_size_local > self.data_parallel_size:
             raise ValueError(
                 f"data_parallel_size_local ({self.data_parallel_size_local}) "

vllm/distributed/device_communicators/all2all.py

Lines changed: 25 additions & 21 deletions
@@ -30,8 +30,8 @@ class NaiveAll2AllManager(All2AllManagerBase):
     debugging.
     """

-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)

     def naive_multicast(
         self,
@@ -101,8 +101,8 @@ class AgRsAll2AllManager(All2AllManagerBase):
     all-gather (dispatch) and reduce-scatter (combine).
     """

-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)

     def dispatch(
         self,
@@ -145,13 +145,16 @@ class PPLXAll2AllManager(All2AllManagerBase):
     All2All communication based on PPLX kernels.
     """

-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_pplx(), (
             "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
             " to install pplx_kernels."
         )
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
+        self.nvshmem_initialized = False
+        self.handle_cache = Cache()

+    def get_handle(self, kwargs):
         if self.internode:
             # inter-node communication needs nvshmem,
             # intra-node communication uses p2p mapping directly
@@ -171,17 +174,18 @@ def __init__(self, cpu_group):
                 if self.rank == 0
                 else nvshmem_alloc_empty_unique_id()
             )
-            dist.broadcast(
-                uid,
-                src=dist.get_process_group_ranks(self.cpu_group)[0],
-                group=self.cpu_group,
-            )
+            if self.tcp_store_group is not None:
+                uid = self.tcp_store_group.broadcast_obj(uid, src=0)
+            else:
+                dist.broadcast(
+                    uid,
+                    src=dist.get_process_group_ranks(self.cpu_group)[0],
+                    group=self.cpu_group,
+                )
             logger.debug("PPLX NVSHMEM UID = %s", uid)
             nvshmem_init(uid, self.rank, self.world_size)
-
-        self.handle_cache = Cache()
-
-    def get_handle(self, kwargs):
+            self.nvshmem_initialized = True
+
         import pplx_kernels as pplx

         return self.handle_cache.get_or_create(
@@ -219,12 +223,12 @@ class DeepEPAll2AllManagerBase(All2AllManagerBase):
     All2All communication based on DeepEP High-Throughput kernels.
     """

-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         assert has_deep_ep(), (
             "DeepEP kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md"
             " to install DeepEP kernels."
         ) # noqa
-        super().__init__(cpu_group)
+        super().__init__(cpu_group, tcp_store_group)
         self.handle_cache = Cache()

         # This is the DeepEP default. Stick to it till we can establish
@@ -256,8 +260,8 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
     All2All communication based on DeepEP High-Throughput kernels.
     """

-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)

     def _make_all2all_kwargs(self) -> dict[Any, Any]:
         # Defaults for internode and intranode are taken from DeepEP tests.
@@ -313,8 +317,8 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
     All2All communication based on DeepEP Low-Latency kernels.
     """

-    def __init__(self, cpu_group):
-        super().__init__(cpu_group)
+    def __init__(self, cpu_group, tcp_store_group=None):
+        super().__init__(cpu_group, tcp_store_group)

     def _make_all2all_kwargs(
         self,
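The PPLX manager now defers NVSHMEM setup to get_handle and, when the manager was built from a stateless TCP-store group, broadcasts the UID through that store instead of a torch.distributed collective. A sketch of the same fallback pattern generalized to any picklable object (broadcast_from_rank0 is an illustrative helper name, not part of the commit; the diff itself broadcasts the UID tensor with dist.broadcast):

```python
# Sketch of the broadcast fallback used in PPLXAll2AllManager.get_handle above.
# broadcast_from_rank0 is a made-up helper for illustration; tcp_store_group is
# expected to expose broadcast_obj(obj, src) as in the diff.
from typing import Any

import torch.distributed as dist


def broadcast_from_rank0(obj: Any, cpu_group, tcp_store_group=None) -> Any:
    if tcp_store_group is not None:
        # Stateless path: no torch.distributed world exists, so ship the object
        # through the TCP store, with rank 0 as the source.
        return tcp_store_group.broadcast_obj(obj, src=0)
    # Regular path: use the process group. For arbitrary picklable objects the
    # object-list collective is the generic counterpart of the tensor broadcast.
    src = dist.get_process_group_ranks(cpu_group)[0]
    buf = [obj]
    dist.broadcast_object_list(buf, src=src, group=cpu_group)
    return buf[0]
```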

vllm/distributed/device_communicators/base_device_communicator.py

Lines changed: 30 additions & 10 deletions
@@ -30,8 +30,9 @@ class All2AllManagerBase:
     rank: int
     world_size: int

-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         self.cpu_group = cpu_group
+        self.tcp_store_group = tcp_store_group

         # compute some common properties
         from vllm.distributed.parallel_state import (
@@ -48,12 +49,15 @@ def __init__(self, cpu_group):
         # when we create this object
         self.dp_rank = self.dp_group.rank_in_group
         self.dp_world_size = self.dp_group.world_size
-        self.rank = dist.get_rank(cpu_group)
-        self.world_size = dist.get_world_size(cpu_group)
+        self.rank = cpu_group.rank()
+        self.world_size = cpu_group.size()

         # all2all communication often has separate implementations for
         # intra-node and inter-node communication
-        self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
+        if tcp_store_group is None:
+            self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
+        else:
+            self.internode = not all(in_the_same_node_as(tcp_store_group, source_rank=0))

     def get_handle(self, kwargs):
         # get a handle for the all2all communication,
@@ -99,17 +103,33 @@ def __init__(
         device: Optional[torch.device] = None,
         device_group: Optional[ProcessGroup] = None,
         unique_name: str = "",
+        global_ranks: Optional[list[int]] = None,
+        global_world_size: Optional[int] = None
     ):
         self.device = device or torch.device("cpu")
         self.cpu_group = cpu_group
         self.device_group = device_group
         self.unique_name = unique_name
-        self.rank = dist.get_rank(cpu_group)
-        self.world_size = dist.get_world_size(cpu_group)
-        self.ranks = dist.get_process_group_ranks(cpu_group)
-        self.global_rank = dist.get_rank()
-        self.global_world_size = dist.get_world_size()
-        self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank)
+
+        # Check if this is a stateless process group
+        from torch.distributed.distributed_c10d import _world
+        is_stateless = _world.pg_map.get(cpu_group, None) is None
+
+        if is_stateless:
+            # For stateless groups, we can't use torch.distributed methods
+            self.rank = cpu_group.rank()
+            self.world_size = cpu_group.size()
+            self.ranks = global_ranks
+            self.global_rank = self.ranks[self.rank]
+            self.global_world_size = global_world_size
+            self.rank_in_group = self.rank
+        else:
+            self.rank = dist.get_rank(cpu_group)
+            self.world_size = dist.get_world_size(cpu_group)
+            self.ranks = dist.get_process_group_ranks(cpu_group)
+            self.global_rank = dist.get_rank()
+            self.global_world_size = dist.get_world_size()
+            self.rank_in_group = dist.get_group_rank(self.cpu_group, self.global_rank)

         use_ep = False
         from vllm.config import get_current_vllm_config
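The stateless-group check above relies on the fact that groups created through torch.distributed's regular init path are registered in the private _world.pg_map registry, while stateless groups are not. A minimal sketch of that probe in isolation (is_stateless_group is an illustrative name; _world is a private torch API and may change between releases):

```python
# Sketch of the detection used in DeviceCommunicatorBase.__init__ above.
# Groups registered via torch.distributed.new_group() appear in _world.pg_map;
# a group built outside init_process_group does not, so its rank/size must be
# read from the group object itself rather than from dist.get_rank() and friends.
from torch.distributed import ProcessGroup
from torch.distributed.distributed_c10d import _world


def is_stateless_group(cpu_group: ProcessGroup) -> bool:
    return _world.pg_map.get(cpu_group, None) is None
```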
