Skip to content

Commit 9b01870

Browse files
authored
[Bugfix] Fix cuda event usage with CPU model runner (#23643)
Signed-off-by: jiang1.li <[email protected]>
1 parent 44ac25e commit 9b01870

File tree

2 files changed

+26
-4
lines changed

2 files changed

+26
-4
lines changed

vllm/v1/worker/cpu_model_runner.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from vllm.model_executor.model_loader import get_model
1212
from vllm.v1.attention.backends.cpu_attn import TorchSDPAMetadataBuilderV1
1313
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
14+
from vllm.v1.worker.utils import CpuGpuBuffer
1415

1516
if TYPE_CHECKING:
1617
from vllm.v1.core.sched.output import SchedulerOutput
@@ -21,7 +22,8 @@
2122
class CPUModelRunner(GPUModelRunner):
2223

2324
def __init__(self, vllm_config: VllmConfig, device: torch.device):
24-
super().__init__(vllm_config, device)
25+
with _torch_cuda_wrapper():
26+
super().__init__(vllm_config, device)
2527

2628
assert device == torch.device("cpu")
2729
assert self.speculative_config is None, "spec decode is not supported."
@@ -71,8 +73,8 @@ def replace_tensor(obj: Any, cpu_attr_name: str,
7173
setattr(obj, device_attr_name, cpu_tensor)
7274

7375
for k, v in vars(self).items():
74-
if k.endswith("_cpu") and isinstance(v, torch.Tensor):
75-
replace_tensor(self, k, k[:-4])
76+
if isinstance(v, CpuGpuBuffer):
77+
v.gpu = v.cpu
7678

7779
for k, v in vars(self.input_batch).items():
7880
if k.endswith("_cpu_tensor") and isinstance(v, torch.Tensor):
@@ -108,6 +110,26 @@ def _init_device_properties(self) -> None:
108110
def _sync_device(self) -> None:
109111
pass
110112

113+
def _to_list(self, sampled_token_ids: torch.Tensor) -> list[list[int]]:
114+
return sampled_token_ids.tolist()
115+
116+
117+
@contextmanager
118+
def _torch_cuda_wrapper():
119+
120+
class _EventPlaceholder:
121+
122+
def __init__(self, *args, **kwargs) -> None:
123+
self.record = lambda: None
124+
self.synchronize = lambda: None
125+
126+
try:
127+
cuda_event = torch.cuda.Event
128+
torch.cuda.Event = _EventPlaceholder
129+
yield
130+
finally:
131+
torch.cuda.Event = cuda_event
132+
111133

112134
@contextmanager
113135
def _set_global_compilation_settings(config: VllmConfig):

vllm/v1/worker/gpu_model_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,7 @@ def __init__(
321321
(self.max_model_len, 1),
322322
dtype=torch.int64,
323323
device="cpu",
324-
pin_memory=True)
324+
pin_memory=self.pin_memory)
325325

326326
def _make_buffer(self, *args, dtype: torch.dtype) -> CpuGpuBuffer:
327327
return CpuGpuBuffer(*args,

0 commit comments

Comments (0)