
Commit c3372e8

Remove dependency on CuPy (#2152)
1 parent b0a1d66 commit c3372e8

File tree

7 files changed: +19 −208 lines


requirements.txt

Lines changed: 0 additions & 1 deletion

@@ -12,4 +12,3 @@ fastapi
 uvicorn[standard]
 pydantic == 1.10.13 # Required for OpenAI server.
 aioprometheus[starlette]
-cupy-cuda12x # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead. # FIXME: Fix this in setup.py.

vllm/engine/llm_engine.py

Lines changed: 1 addition & 2 deletions

@@ -17,7 +17,7 @@
                              SequenceOutput, SequenceStatus)
 from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
                                                get_tokenizer)
-from vllm.utils import Counter, get_open_port
+from vllm.utils import Counter
 
 if ray:
     from ray.air.util.torch_dist import init_torch_dist_process_group
@@ -190,7 +190,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
             ))
         self._run_workers(
             "init_model",
-            cupy_port=get_open_port(),
             get_all_outputs=True,
         )
         self._run_workers(

vllm/model_executor/parallel_utils/communication_op.py

Lines changed: 2 additions & 8 deletions

@@ -1,10 +1,8 @@
 import torch
 
-from vllm.model_executor.parallel_utils import cupy_utils
 from vllm.model_executor.parallel_utils.parallel_state import (
     get_tensor_model_parallel_world_size,
     get_tensor_model_parallel_group,
-    is_custom_nccl_enabled_for_all_reduce,
 )
 
 
@@ -17,12 +15,8 @@ def tensor_model_parallel_all_reduce(input_):
     if get_tensor_model_parallel_world_size() == 1:
         return input_
     # All-reduce.
-    if is_custom_nccl_enabled_for_all_reduce():
-        # TODO: support multiple parallel groups.
-        cupy_utils.all_reduce(input_)
-    else:
-        torch.distributed.all_reduce(input_,
-                                     group=get_tensor_model_parallel_group())
+    torch.distributed.all_reduce(input_,
+                                 group=get_tensor_model_parallel_group())
     return input_
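The simplified tensor_model_parallel_all_reduce above now always goes through torch.distributed. For reference, here is a minimal, self-contained sketch of the same pattern, an in-place all-reduce over a process group that is skipped for a single-rank group. The gloo backend, env:// rendezvous, and script name are illustrative assumptions, not vLLM code.

# Minimal sketch only: an in-place all-reduce over a dedicated process group,
# mirroring the simplified code path above. The gloo backend and the script
# name "all_reduce_sketch.py" are illustrative assumptions. Launch with e.g.:
#   torchrun --nproc_per_node=2 all_reduce_sketch.py
import torch
import torch.distributed as dist


def all_reduce_in_group(tensor: torch.Tensor, group) -> torch.Tensor:
    """All-reduce `tensor` in place across `group`; no-op for a 1-rank group."""
    if dist.get_world_size(group=group) == 1:
        return tensor
    dist.all_reduce(tensor, group=group)
    return tensor


if __name__ == "__main__":
    # torchrun sets RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT for env:// init.
    dist.init_process_group(backend="gloo", init_method="env://")
    # A group over all ranks; a real tensor-parallel group would be a subset.
    group = dist.new_group(ranks=list(range(dist.get_world_size())))
    x = torch.ones(4) * (dist.get_rank() + 1)
    all_reduce_in_group(x, group)
    print(f"rank {dist.get_rank()}: {x.tolist()}")
    dist.destroy_process_group()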

vllm/model_executor/parallel_utils/cupy_utils.py

Lines changed: 0 additions & 115 deletions
This file was deleted.

vllm/model_executor/parallel_utils/parallel_state.py

Lines changed: 0 additions & 37 deletions

@@ -3,12 +3,9 @@
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 """Tensor and pipeline parallel groups."""
-import contextlib
 
 import torch
 
-from vllm.model_executor.parallel_utils import cupy_utils
-
 # Tensor model parallel group that the current rank belongs to.
 _TENSOR_MODEL_PARALLEL_GROUP = None
 # Pipeline model parallel group that the current rank belongs to.
@@ -180,37 +177,3 @@ def destroy_model_parallel():
     _PIPELINE_MODEL_PARALLEL_GROUP = None
     global _PIPELINE_GLOBAL_RANKS
     _PIPELINE_GLOBAL_RANKS = None
-
-    # Destroy the cupy states if any.
-    cupy_utils.destroy_process_group()
-
-
-# Whether to use cupy for nccl all reduce.
-# We use cupy for all reduce when using CUDA graph, because torch.distributed
-# is not well supported by CUDA graph.
-_ENABLE_CUPY_FOR_ALL_REDUCE = False
-
-
-@contextlib.contextmanager
-def with_custom_nccl_for_all_reduce():
-    """use custom nccl instead of torch.distributed for all reduce"""
-    tp_size = get_tensor_model_parallel_world_size()
-    if tp_size == 1:
-        # No-op.
-        # NOTE(woosuk): We don't initialize CuPy when tp_size is 1.
-        yield
-    else:
-        global _ENABLE_CUPY_FOR_ALL_REDUCE
-        old = _ENABLE_CUPY_FOR_ALL_REDUCE
-        _ENABLE_CUPY_FOR_ALL_REDUCE = True
-
-        stream = torch.cuda.current_stream()
-        with cupy_utils.set_cupy_stream(stream):
-            yield
-        _ENABLE_CUPY_FOR_ALL_REDUCE = old
-
-
-def is_custom_nccl_enabled_for_all_reduce():
-    """check if custom nccl is enabled for all reduce"""
-    global _ENABLE_CUPY_FOR_ALL_REDUCE
-    return _ENABLE_CUPY_FOR_ALL_REDUCE

vllm/worker/model_runner.py

Lines changed: 12 additions & 18 deletions

@@ -8,8 +8,6 @@
 from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig
 from vllm.logger import init_logger
 from vllm.model_executor import get_model, InputMetadata, SamplingMetadata
-from vllm.model_executor.parallel_utils.parallel_state import (
-    with_custom_nccl_for_all_reduce)
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
 
@@ -459,29 +457,25 @@ def capture(
         # Run the model once without capturing the graph.
         # This is to make sure that the captured graph does not include the
         # kernel launches for initial benchmarking (e.g., Triton autotune).
-        with with_custom_nccl_for_all_reduce():
-            self.model(
+        self.model(
+            input_ids,
+            positions,
+            kv_caches,
+            input_metadata,
+        )
+        torch.cuda.synchronize()
+
+        # Capture the graph.
+        self.graph = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(self.graph, pool=memory_pool):
+            hidden_states = self.model(
                 input_ids,
                 positions,
                 kv_caches,
                 input_metadata,
             )
         torch.cuda.synchronize()
 
-        # Capture the graph.
-        # NOTE(woosuk): Python 3.8 does not support multi-line with statements.
-        # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement
-        self.graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(self.graph, pool=memory_pool):  # noqa: SIM117
-            with with_custom_nccl_for_all_reduce():
-                hidden_states = self.model(
-                    input_ids,
-                    positions,
-                    kv_caches,
-                    input_metadata,
-                )
-        torch.cuda.synchronize()
-
         # Save the input and output buffers.
         self.input_buffers = {
             "input_ids": input_ids,

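With the CuPy-specific context manager gone, capture() reduces to the standard torch.cuda.CUDAGraph warm-up/capture/replay sequence, and any torch.distributed collectives launched inside the captured region are recorded directly. A stripped-down sketch of that sequence, using a toy linear model rather than vLLM's runner, for reference:

# Stripped-down CUDA-graph sketch (warm up, capture, replay); needs a CUDA GPU.
# The toy Linear model and shapes are illustrative, not vLLM's ModelRunner.
import torch

model = torch.nn.Linear(64, 64).cuda()
static_input = torch.zeros(8, 64, device="cuda")

# 1) Warm-up run so one-time work (autotuning, lazy init) is not captured.
model(static_input)
torch.cuda.synchronize()

# 2) Capture: kernels launched inside this context are recorded, not run eagerly.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_output = model(static_input)
torch.cuda.synchronize()

# 3) Replay: copy new data into the captured input buffer, then relaunch the graph.
static_input.copy_(torch.randn(8, 64, device="cuda"))
graph.replay()
torch.cuda.synchronize()
print(static_output.shape)  # torch.Size([8, 64])

The key property, mirrored by the saved input_buffers in the real code, is that inputs and outputs are fixed buffers: replay() reruns the recorded kernels on whatever data has been copied into them since the last launch.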
vllm/worker/worker.py

Lines changed: 4 additions & 27 deletions

@@ -8,7 +8,6 @@
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.model_executor import set_random_seed
-from vllm.model_executor.parallel_utils import cupy_utils
 from vllm.model_executor.parallel_utils.parallel_state import (
     initialize_model_parallel)
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
@@ -47,7 +46,7 @@ def __init__(
         self.cache_events = None
         self.gpu_cache = None
 
-    def init_model(self, cupy_port: Optional[int] = None):
+    def init_model(self) -> None:
         # torch.distributed.all_reduce does not free the input tensor until
         # the synchronization point. This causes the memory usage to grow
         # as the number of all_reduce calls increases. This env var disables
@@ -71,7 +70,7 @@ def init_model(self, cupy_port: Optional[int] = None):
 
         # Initialize the distributed environment.
         _init_distributed_environment(self.parallel_config, self.rank,
-                                      cupy_port, self.distributed_init_method)
+                                      self.distributed_init_method)
 
         # Initialize the model.
         set_random_seed(self.model_config.seed)
@@ -165,7 +164,6 @@ def execute_model(
 def _init_distributed_environment(
     parallel_config: ParallelConfig,
     rank: int,
-    cupy_port: Optional[int],
     distributed_init_method: Optional[str] = None,
 ) -> None:
     """Initialize the distributed environment."""
@@ -188,29 +186,8 @@ def _init_distributed_environment(
         init_method=distributed_init_method,
     )
 
-    if cupy_utils.is_initialized():
-        cupy_world_size = cupy_utils.get_world_size()
-        if cupy_world_size != parallel_config.world_size:
-            raise RuntimeError(
-                "cupy.distributed is already initialized but the cupy world "
-                "size does not match parallel_config.world_size "
-                f"({cupy_world_size} vs. {parallel_config.world_size}).")
-    elif parallel_config.world_size > 1:
-        # NOTE(woosuk): We don't initialize CuPy process group when world size
-        # is 1.
-        # TODO(woosuk): Support multi-node connection.
-        cupy_utils.init_process_group(
-            world_size=parallel_config.world_size,
-            rank=rank,
-            host="localhost",
-            port=cupy_port,
-        )
-
-    if parallel_config.world_size > 1:
-        # A small all_reduce for warmup.
-        torch.distributed.all_reduce(torch.zeros(1).cuda())
-        cupy_utils.all_reduce(torch.zeros(1).cuda())
-
+    # A small all_reduce for warmup.
+    torch.distributed.all_reduce(torch.zeros(1).cuda())
     initialize_model_parallel(parallel_config.tensor_parallel_size,
                               parallel_config.pipeline_parallel_size)
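After this change, _init_distributed_environment only has to bring up the torch.distributed process group and issue a small warm-up all-reduce before initialize_model_parallel. A condensed, hedged sketch of that flow, standalone and using a placeholder TCP rendezvous instead of vLLM's distributed_init_method plumbing:

# Condensed sketch of the torch.distributed-only initialization path above.
# The TCP rendezvous address/port are placeholders; NCCL needs one GPU per rank.
import torch
import torch.distributed as dist


def init_distributed_environment(world_size: int, rank: int,
                                 init_method: str = "tcp://127.0.0.1:29500") -> None:
    if dist.is_initialized():
        # Reuse an existing process group, but sanity-check its size.
        if dist.get_world_size() != world_size:
            raise RuntimeError("torch.distributed is already initialized with a "
                               "different world size.")
    else:
        dist.init_process_group(backend="nccl",
                                world_size=world_size,
                                rank=rank,
                                init_method=init_method)
    # A small all-reduce to warm up the NCCL communicators before real work.
    dist.all_reduce(torch.zeros(1).cuda())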

0 commit comments
