# [1/N] Elastic EP Milestone 2 #26278

Base: `main`
**New file** (+16 lines): a benchmark script that drives `vllm bench serve` against the local server with a random dataset.

```bash
#!/bin/bash

# MODEL_NAME="deepseek-ai/DeepSeek-V3.1"
MODEL_NAME="Qwen/Qwen3-30B-A3B-Thinking-2507-FP8"
# MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507-FP8"
HOST="localhost"
PORT=8006

vllm bench serve \
    --model $MODEL_NAME \
    --host $HOST \
    --port $PORT \
    --dataset-name random \
    --random-input-len 128 \
    --random-output-len 128 \
    --num-prompts 512
```
**New file** (+5 lines): a helper script that rescales a running deployment to a new data-parallel size.

```bash
#!/bin/bash
HOST="localhost"
PORT=8006

python examples/online_serving/elastic_ep/scale.py --host $HOST --port $PORT --new-dp-size 4
```
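For reference, a minimal sketch of what a scale client like this can look like. The endpoint path `/scale_elastic_ep` and the payload field `new_data_parallel_size` are assumptions for illustration, not confirmed by this diff; the authoritative version is the `examples/online_serving/elastic_ep/scale.py` invoked above.

```python
# Hypothetical sketch of a scale client. The endpoint path and payload
# field are assumptions; see examples/online_serving/elastic_ep/scale.py
# in the PR for the real implementation.
import argparse

import requests


def scale(host: str, port: int, new_dp_size: int) -> None:
    # Assumed API: POST the target data-parallel size and let the server
    # rescale its engine cores and rebalance experts.
    url = f"http://{host}:{port}/scale_elastic_ep"
    response = requests.post(
        url, json={"new_data_parallel_size": new_dp_size}, timeout=300)
    response.raise_for_status()
    print(response.text)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scale a vLLM deployment")
    parser.add_argument("--host", default="localhost")
    parser.add_argument("--port", type=int, default=8006)
    parser.add_argument("--new-dp-size", type=int, required=True)
    args = parser.parse_args()
    scale(args.host, args.port, args.new_dp_size)
```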
**New file** (+49 lines): the server launch script. It enables expert parallelism, elastic EP, and EPLB, and starts the server with the Ray data-parallel backend.

```bash
#!/bin/bash

# MODEL_NAME="deepseek-ai/DeepSeek-V3.1"
MODEL_NAME="Qwen/Qwen3-30B-A3B-Thinking-2507-FP8"
# MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507-FP8"
HOST="0.0.0.0"
PORT=8006

DATA_PARALLEL_SIZE=2
DATA_PARALLEL_SIZE_LOCAL=2
LEADER_ADDRESS="192.168.5.45"
# LEADER_ADDRESS="172.18.0.3"

NUM_REDUNDANT_EXPERTS=16
EPLB_WINDOW_SIZE=1000
EPLB_STEP_INTERVAL=3000
MAX_MODEL_LEN=16384
GPU_MEMORY_UTILIZATION=0.9

export DG_JIT_NVCC_COMPILER=/usr/local/cuda-12.8/bin/nvcc
export CUDA_HOME='/usr/local/cuda-12.8'

export VLLM_USE_V1=1
export VLLM_ALL2ALL_BACKEND="pplx"
# export VLLM_ALL2ALL_BACKEND="deepep_low_latency"
export VLLM_USE_DEEP_GEMM=1
# export VLLM_ATTENTION_BACKEND="TRITON_MLA"

# Launch the vLLM server
vllm serve $MODEL_NAME --trust-remote-code \
    --disable-log-requests \
    --host $HOST \
    --port $PORT \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
    --max-model-len $MAX_MODEL_LEN \
    --no-enable-prefix-caching \
    --enable-expert-parallel \
    --enable-elastic-ep \
    --enable-eplb \
    --eplb-config.num_redundant_experts $NUM_REDUNDANT_EXPERTS \
    --eplb-config.window_size $EPLB_WINDOW_SIZE \
    --eplb-config.step_interval $EPLB_STEP_INTERVAL \
    --data-parallel-backend ray \
    --data-parallel-size $DATA_PARALLEL_SIZE \
    --data-parallel-size-local $DATA_PARALLEL_SIZE_LOCAL \
    --data-parallel-address $LEADER_ADDRESS \
    --data-parallel-rpc-port 9876 \
    --data-parallel-start-rank 0
```
**Modified file**: `All2AllManagerBase.__init__` gains an optional `tcp_store_group`, rank and world size are read from the group object itself instead of `torch.distributed`, and intra-node detection uses `tcp_store_group` when one is supplied.

```diff
@@ -29,8 +29,9 @@ def get_or_create(self, kwargs, func):

 class All2AllManagerBase:

-    def __init__(self, cpu_group):
+    def __init__(self, cpu_group, tcp_store_group=None):
         self.cpu_group = cpu_group
+        self.tcp_store_group = tcp_store_group

         # compute some common properties
         from vllm.distributed.parallel_state import (get_dp_group,
@@ -44,12 +45,15 @@ def __init__(self, cpu_group):
         # when we create this object
         self.dp_rank = self.dp_group.rank_in_group
         self.dp_world_size = self.dp_group.world_size
-        self.rank = dist.get_rank(cpu_group)
-        self.world_size = dist.get_world_size(cpu_group)
+        self.rank = cpu_group.rank()
+        self.world_size = cpu_group.size()

         # all2all communication often has separate implementations for
         # intra-node and inter-node communication
-        self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
+        if tcp_store_group is None:
+            self.internode = not all(in_the_same_node_as(cpu_group, source_rank=0))
+        else:
+            self.internode = not all(in_the_same_node_as(tcp_store_group, source_rank=0))

     def get_handle(self, kwargs):
         # get a handle for the all2all communication,
```
In the same file, the communicator base class `__init__` learns to handle stateless process groups (the hunk continues through the inline review comment below):

```diff
@@ -83,18 +87,34 @@ def __init__(self,
                  cpu_group: ProcessGroup,
                  device: Optional[torch.device] = None,
                  device_group: Optional[ProcessGroup] = None,
-                 unique_name: str = ""):
+                 unique_name: str = "",
+                 global_ranks: Optional[list[int]] = None,
+                 global_world_size: Optional[int] = None):
         self.device = device or torch.device("cpu")
         self.cpu_group = cpu_group
         self.device_group = device_group
         self.unique_name = unique_name
-        self.rank = dist.get_rank(cpu_group)
-        self.world_size = dist.get_world_size(cpu_group)
-        self.ranks = dist.get_process_group_ranks(cpu_group)
-        self.global_rank = dist.get_rank()
-        self.global_world_size = dist.get_world_size()
-        self.rank_in_group = dist.get_group_rank(self.cpu_group,
-                                                 self.global_rank)
+
+        # Check if this is a stateless process group
+        from torch.distributed.distributed_c10d import _world
+        is_stateless = _world.pg_map.get(cpu_group, None) is None
```
> **Review comment** (on lines +98 to +100): The check …
```diff
+        if is_stateless:
+            # For stateless groups, we can't use torch.distributed methods
+            self.rank = cpu_group.rank()
+            self.world_size = cpu_group.size()
+            self.ranks = global_ranks
+            self.global_rank = self.ranks[self.rank]
+            self.global_world_size = global_world_size
+            self.rank_in_group = self.rank
+        else:
+            self.rank = dist.get_rank(cpu_group)
+            self.world_size = dist.get_world_size(cpu_group)
+            self.ranks = dist.get_process_group_ranks(cpu_group)
+            self.global_rank = dist.get_rank()
+            self.global_world_size = dist.get_world_size()
+            self.rank_in_group = dist.get_group_rank(self.cpu_group,
+                                                     self.global_rank)

         use_ep = False
         from vllm.config import get_current_vllm_config
```
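The `is_stateless` check above decides which branch to take by probing `torch.distributed.distributed_c10d._world.pg_map`, a private registry of initialized process groups. As a hedged alternative sketch: if the stateless groups arriving here are instances of a dedicated class (vLLM ships a `StatelessProcessGroup` in `vllm.distributed.utils`; whether this PR passes that exact type is an assumption), an `isinstance` check would state the intent without relying on private internals:

```python
# Sketch of an alternative to probing the private _world.pg_map registry.
# Assumption: stateless groups reaching this code are instances of
# vllm.distributed.utils.StatelessProcessGroup (not verified by this diff).


def is_stateless_group(cpu_group: object) -> bool:
    """Return True when cpu_group was created without torch.distributed.

    An isinstance check makes the intent explicit and avoids depending on
    torch.distributed private internals, at the cost of coupling to the
    concrete stateless-group type.
    """
    from vllm.distributed.utils import StatelessProcessGroup
    return isinstance(cpu_group, StatelessProcessGroup)
```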
> **Review comment:** These methods use `pop(0)` to retrieve a port from a list without checking whether the list is empty. If the port lists (`_stateless_world_group_port_list`, `_stateless_dp_group_port_list`, `_stateless_ep_group_port_list`) are exhausted for any reason, this raises an `IndexError` and crashes the process. While the logic in `__post_init__` appears to pre-allocate the necessary ports, the design is fragile. A more robust implementation would check whether the list is empty before popping and raise a more informative error.
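A minimal sketch of the defensive pattern the reviewer is asking for; the helper name `pop_port` and the error wording are illustrative, not from the PR:

```python
# Illustrative helper: fail with an actionable message instead of a bare
# IndexError when a pre-allocated port list is exhausted.


def pop_port(port_list: list[int], group_name: str) -> int:
    """Pop the next pre-allocated port for group_name, or raise clearly."""
    if not port_list:
        raise RuntimeError(
            f"Exhausted pre-allocated ports for {group_name}; "
            "ensure __post_init__ reserved enough ports for this group.")
    return port_list.pop(0)


# Hypothetical usage with the field names from the review comment:
# port = pop_port(self._stateless_ep_group_port_list, "stateless EP group")
```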