Skip to content

Commit 302962e

Browse files
authored
[Bugfix] Skip dead and non-GPU nodes for Ray DP engine allocation (#22275)
Signed-off-by: Rui Qiao <[email protected]>
1 parent 7e6544c commit 302962e

File tree

1 file changed

+11
-2
lines changed

1 file changed

+11
-2
lines changed

vllm/v1/engine/utils.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -297,10 +297,10 @@ def create_dp_placement_groups(
297297
local_engine_count = \
298298
vllm_config.parallel_config.data_parallel_size_local
299299

300-
nodes = sorted(list_nodes(),
300+
nodes = sorted(list_nodes(filters=[("state", "=", "ALIVE")]),
301301
key=lambda node: node.node_ip != dp_master_ip)
302302
assert nodes[0].node_ip == dp_master_ip, (
303-
"The first node must be the head node")
303+
"The head node is missing or dead")
304304
assert len(nodes) == 1 or nodes[1].node_ip != dp_master_ip, (
305305
"There can only be one head node")
306306

@@ -312,6 +312,8 @@ def create_dp_placement_groups(
312312
for node in nodes:
313313
node_ip = node.node_ip
314314
node_resources = available_resources[node.node_id]
315+
if "GPU" not in node_resources:
316+
continue
315317
# For now, each DP rank can only be assigned to one node
316318
# TODO(rui): support allocating a single DP rank
317319
# to multiple nodes
@@ -346,6 +348,13 @@ def create_dp_placement_groups(
346348
)
347349
placement_groups.append(pg)
348350
local_dp_ranks.append(i)
351+
if len(placement_groups) < num_pg_to_create:
352+
raise ValueError(
353+
f"Not enough resources to allocate {num_pg_to_create} "
354+
"placement groups, only created "
355+
f"{len(placement_groups)} placement groups. "
356+
"Available resources: "
357+
f"{available_resources}")
349358
return placement_groups, local_dp_ranks
350359

351360
@staticmethod

0 commit comments

Comments
 (0)