@@ -111,6 +111,7 @@ def __init__(
         self._config = config
         self.use_chat_template = config.use_chat_template
         self.data_parallel_size = int(config.data_parallel_size)
+        self.tensor_parallel_size = int(config.tensor_parallel_size)
 
         self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False
         self._tokenizer = self._create_auto_tokenizer(config, env_config)
@@ -184,7 +185,7 @@ def _create_auto_model(self, config: VLLMModelConfig, env_config: EnvConfig) ->
             "seed": 1234,
         }
         if int(config.data_parallel_size) > 1:
-            self.model_args["worker_use_ray"] = True
+            self.model_args["distributed_executor_backend"] = "ray"
             self._batch_size = "auto"
             return None
 
@@ -331,7 +332,9 @@ def _generate(
         # see https://github.com/vllm-project/vllm/issues/973
         # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
         # but then tensor_parallel breaks
-        @ray.remote
+        # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus is not set,
+        # as vLLM complains that no GPUs are available.
+        @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None)
         def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
             llm = LLM(**model_args)
             return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
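
For context on the second hunk: newer vLLM releases dropped the boolean `worker_use_ray` engine argument in favor of the `distributed_executor_backend` argument, which names the executor explicitly. A minimal sketch of how the patched model_args reach the engine, assuming an illustrative checkpoint name and data-parallel degree (neither is part of this commit):

    from vllm import LLM

    # Sketch of the args assembled in _create_auto_model; values are illustrative.
    model_args = {
        "model": "gpt2",             # illustrative checkpoint, not from this commit
        "tensor_parallel_size": 1,
        "seed": 1234,
    }

    data_parallel_size = 2           # illustrative DP degree
    if data_parallel_size > 1:
        # Newer vLLM: select the Ray executor by name instead of the removed
        # boolean flag `worker_use_ray`.
        model_args["distributed_executor_backend"] = "ray"

    if data_parallel_size == 1:
        # Single-replica path builds the engine directly; in the DP > 1 path the
        # patched code returns None and each Ray worker builds its own engine later.
        llm = LLM(**model_args)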
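And for the third hunk, a sketch of how such a Ray-decorated worker is typically fanned out over the data-parallel replicas. The round-robin chunking, the illustrative prompts, and the model args are assumptions for the sketch, not part of this commit; the generate call mirrors the one in the diff:

    import ray
    from vllm import LLM, SamplingParams

    tensor_parallel_size = 1
    data_parallel_size = 2  # illustrative

    # num_gpus=1 reserves one GPU per replica when TP == 1; with TP > 1 vLLM's
    # Ray executor allocates the GPUs itself, so nothing is reserved here.
    @ray.remote(num_gpus=1 if tensor_parallel_size == 1 else None)
    def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
        llm = LLM(**model_args)
        return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)

    model_args = {"model": "gpt2", "seed": 1234}     # illustrative
    sampling_params = SamplingParams(max_tokens=32)
    requests = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]     # token-id prompts, illustrative

    # Round-robin the requests over the replicas, launch one task per replica,
    # then gather the results and release the cluster resources.
    chunks = [requests[i::data_parallel_size] for i in range(data_parallel_size)]
    object_refs = [run_inference_one_model.remote(model_args, sampling_params, chunk) for chunk in chunks]
    results = ray.get(object_refs)
    ray.shutdown()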