@@ -297,10 +297,10 @@ def create_dp_placement_groups(
297
297
local_engine_count = \
298
298
vllm_config .parallel_config .data_parallel_size_local
299
299
300
- nodes = sorted (list_nodes (),
300
+ nodes = sorted (list_nodes (filters = [( "state" , "=" , "ALIVE" )] ),
301
301
key = lambda node : node .node_ip != dp_master_ip )
302
302
assert nodes [0 ].node_ip == dp_master_ip , (
303
- "The first node must be the head node " )
303
+ "The head node is missing or dead " )
304
304
assert len (nodes ) == 1 or nodes [1 ].node_ip != dp_master_ip , (
305
305
"There can only be one head node" )
306
306
@@ -312,6 +312,8 @@ def create_dp_placement_groups(
312
312
for node in nodes :
313
313
node_ip = node .node_ip
314
314
node_resources = available_resources [node .node_id ]
315
+ if "GPU" not in node_resources :
316
+ continue
315
317
# For now, each DP rank can only be assigned to one node
316
318
# TODO(rui): support allocating a single DP rank
317
319
# to multiple nodes
@@ -346,6 +348,13 @@ def create_dp_placement_groups(
346
348
)
347
349
placement_groups .append (pg )
348
350
local_dp_ranks .append (i )
351
+ if len (placement_groups ) < num_pg_to_create :
352
+ raise ValueError (
353
+ f"Not enough resources to allocate { num_pg_to_create } "
354
+ "placement groups, only created "
355
+ f"{ len (placement_groups )} placement groups. "
356
+ "Available resources: "
357
+ f"{ available_resources } " )
349
358
return placement_groups , local_dp_ranks
350
359
351
360
@staticmethod
0 commit comments