Skip to content

Commit 077f0a2

Browse files
authored
[Frontend] Enable support for CPU backend in AsyncLLMEngine. (#3993)
Signed-off-by: Tao He <[email protected]>
1 parent e73ed0f commit 077f0a2

File tree

2 files changed

+30
-2
lines changed

2 files changed

+30
-2
lines changed

vllm/engine/async_llm_engine.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,11 @@ def from_engine_args(
343343
if engine_config.device_config.device_type == "neuron":
344344
from vllm.executor.neuron_executor import NeuronExecutorAsync
345345
executor_class = NeuronExecutorAsync
346+
elif engine_config.device_config.device_type == "cpu":
347+
assert not engine_config.parallel_config.worker_use_ray, (
348+
"Ray is not supported with the CPU backend.")
349+
from vllm.executor.cpu_executor import CPUExecutorAsync
350+
executor_class = CPUExecutorAsync
346351
elif engine_config.parallel_config.worker_use_ray:
347352
initialize_ray_cluster(engine_config.parallel_config)
348353
from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync

vllm/executor/cpu_executor.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44
import torch
55

66
from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
7-
from vllm.executor.executor_base import ExecutorBase
7+
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
88
from vllm.logger import init_logger
99
from vllm.lora.request import LoRARequest
1010
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
11-
from vllm.utils import get_distributed_init_method, get_ip, get_open_port
11+
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
12+
make_async)
1213

1314
logger = init_logger(__name__)
1415

@@ -100,6 +101,28 @@ def check_health(self) -> None:
100101
return
101102

102103

104+
class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
    """Asynchronous flavour of ``CPUExecutor``.

    Combines ``CPUExecutor`` with ``ExecutorAsyncBase`` and supplies the
    coroutine entry points: an awaitable model step and an awaitable
    health check.
    """

    async def execute_model_async(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
    ) -> SamplerOutput:
        """Run one model step on the driver worker and await its result.

        All four arguments are forwarded unchanged, by keyword, to
        ``self.driver_worker.execute_model``.

        Returns:
            Whatever the wrapped ``execute_model`` call produces.
        """
        # NOTE(review): make_async presumably turns the synchronous worker
        # call into an awaitable so the event loop is not blocked -- confirm
        # against vllm.utils.make_async.
        run_step = make_async(self.driver_worker.execute_model)
        return await run_step(
            seq_group_metadata_list=seq_group_metadata_list,
            blocks_to_swap_in=blocks_to_swap_in,
            blocks_to_swap_out=blocks_to_swap_out,
            blocks_to_copy=blocks_to_copy,
        )

    async def check_health_async(self) -> None:
        """Health probe; nothing to verify for the CPU executor.

        The CPU executor is considered healthy for as long as it is
        running, so this simply completes without raising.
        """
        return None
124+
125+
103126
def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
104127
if config.dtype == torch.float16:
105128
logger.warning("float16 is not supported on CPU, casting to bfloat16.")

0 commit comments

Comments
 (0)