 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.model_executor import set_random_seed
-from vllm.model_executor.parallel_utils import cupy_utils
 from vllm.model_executor.parallel_utils.parallel_state import (
     initialize_model_parallel)
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
@@ -47,7 +46,7 @@ def __init__(
         self.cache_events = None
         self.gpu_cache = None
 
-    def init_model(self, cupy_port: Optional[int] = None):
+    def init_model(self) -> None:
         # torch.distributed.all_reduce does not free the input tensor until
         # the synchronization point. This causes the memory usage to grow
         # as the number of all_reduce calls increases. This env var disables
@@ -71,7 +70,7 @@ def init_model(self, cupy_port: Optional[int] = None):
 
         # Initialize the distributed environment.
         _init_distributed_environment(self.parallel_config, self.rank,
-                                      cupy_port, self.distributed_init_method)
+                                      self.distributed_init_method)
 
         # Initialize the model.
         set_random_seed(self.model_config.seed)
@@ -165,7 +164,6 @@ def execute_model(
 def _init_distributed_environment(
     parallel_config: ParallelConfig,
     rank: int,
-    cupy_port: Optional[int],
     distributed_init_method: Optional[str] = None,
 ) -> None:
     """Initialize the distributed environment."""
@@ -188,29 +186,8 @@ def _init_distributed_environment(
         init_method=distributed_init_method,
     )
 
-    if cupy_utils.is_initialized():
-        cupy_world_size = cupy_utils.get_world_size()
-        if cupy_world_size != parallel_config.world_size:
-            raise RuntimeError(
-                "cupy.distributed is already initialized but the cupy world "
-                "size does not match parallel_config.world_size "
-                f"({cupy_world_size} vs. {parallel_config.world_size}).")
-    elif parallel_config.world_size > 1:
-        # NOTE(woosuk): We don't initialize CuPy process group when world size
-        # is 1.
-        # TODO(woosuk): Support multi-node connection.
-        cupy_utils.init_process_group(
-            world_size=parallel_config.world_size,
-            rank=rank,
-            host="localhost",
-            port=cupy_port,
-        )
-
-    if parallel_config.world_size > 1:
-        # A small all_reduce for warmup.
-        torch.distributed.all_reduce(torch.zeros(1).cuda())
-        cupy_utils.all_reduce(torch.zeros(1).cuda())
-
+    # A small all_reduce for warmup.
+    torch.distributed.all_reduce(torch.zeros(1).cuda())
     initialize_model_parallel(parallel_config.tensor_parallel_size,
                               parallel_config.pipeline_parallel_size)
 
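For context, a minimal sketch of the distributed setup path that remains after this change, assuming the NCCL backend and a single node: initialize the default torch.distributed process group, then run the small warmup all_reduce added above. The standalone function name init_distributed and the example values under __main__ are illustrative and not part of vLLM.

# Sketch only: mirrors the simplified flow of _init_distributed_environment
# after this diff (torch.distributed init + warmup all_reduce), assuming the
# NCCL backend. The argument values below are placeholders, not vLLM defaults.
import torch
import torch.distributed


def init_distributed(world_size: int, rank: int,
                     distributed_init_method: str) -> None:
    # Set up the default process group, as worker.py does via
    # torch.distributed.init_process_group(...).
    torch.distributed.init_process_group(
        backend="nccl",
        world_size=world_size,
        rank=rank,
        init_method=distributed_init_method,
    )
    # A small all_reduce for warmup (the line this diff keeps in place of the
    # removed cupy warmup).
    torch.distributed.all_reduce(torch.zeros(1).cuda())


if __name__ == "__main__":
    # Example values; in vLLM these come from ParallelConfig and the
    # distributed_init_method passed to the Worker.
    init_distributed(world_size=1, rank=0,
                     distributed_init_method="tcp://127.0.0.1:29500")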