diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7abaffa54c08..93c2d7f0e9ce 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -8,6 +8,7 @@ import msgspec +import vllm.envs as envs import vllm.platforms from vllm.config import ParallelConfig from vllm.executor.msgspec_utils import decode_hook, encode_hook @@ -338,6 +339,7 @@ def initialize_ray_cluster( else: logger.info("No current placement group found. " "Creating a new placement group.") + device_resource_request = envs.VLLM_RAY_PER_WORKER_GPUS num_devices_in_cluster = ray.cluster_resources().get(device_str, 0) # Log a warning message and delay resource allocation failure response. # Avoid immediate rejection to allow user-initiated placement group @@ -349,7 +351,8 @@ def initialize_ray_cluster( device_str) # Create a new placement group placement_group_specs: List[Dict[str, float]] = ([{ - device_str: 1.0 + device_str: + device_resource_request } for _ in range(parallel_config.world_size)]) # vLLM engine is also a worker to execute model with an accelerator, @@ -358,12 +361,13 @@ def initialize_ray_cluster( current_ip = get_ip() current_node_id = ray.get_runtime_context().get_node_id() current_node_resource = available_resources_per_node()[current_node_id] - if current_node_resource.get(device_str, 0) < 1: + if current_node_resource.get(device_str, 0) < device_resource_request: raise ValueError( f"Current node has no {device_str} available. " f"{current_node_resource=}. vLLM engine cannot start without " - f"{device_str}. Make sure you have at least 1 {device_str} " - f"available in a node {current_node_id=} {current_ip=}.") + f"{device_str}. Make sure you have at least {device_resource_request} " + f"{device_str} available in a node {current_node_id=} {current_ip=}." + ) # This way, at least bundle is required to be created in a current # node. placement_group_specs[0][f"node:{current_ip}"] = 0.001