@@ -111,6 +111,7 @@ def __init__(
         self._config = config
         self.use_chat_template = config.use_chat_template
         self.data_parallel_size = int(config.data_parallel_size)
+        self.tensor_parallel_size = int(config.tensor_parallel_size)

         self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False
         self._tokenizer = self._create_auto_tokenizer(config, env_config)
@@ -184,7 +185,7 @@ def _create_auto_model(self, config: VLLMModelConfig, env_config: EnvConfig) ->
             "seed": 1234,
         }
         if int(config.data_parallel_size) > 1:
-            self.model_args["worker_use_ray"] = True
+            self.model_args["distributed_executor_backend"] = "ray"
             self._batch_size = "auto"
             return None

@@ -331,7 +332,9 @@ def _generate(
             # see https://github.com/vllm-project/vllm/issues/973
             # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
             # but then tensor_parallel breaks
-            @ray.remote
+            # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus not set,
+            # as VLLM complains about no GPUs available.
+            @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None)
             def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 llm = LLM(**model_args)
                 return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
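For context, a minimal sketch (not part of the diff) of how a Ray remote function like run_inference_one_model is typically dispatched across data-parallel shards and gathered. It assumes the surrounding method's variables (self.data_parallel_size, self.model_args, sampling_params, requests); the shard split and names are illustrative assumptions, not necessarily lighteval's actual code.

            # Hypothetical dispatch sketch: one Ray task per data-parallel shard.
            shards = [requests[i::self.data_parallel_size] for i in range(self.data_parallel_size)]
            futures = [
                run_inference_one_model.remote(self.model_args, sampling_params, shard)
                for shard in shards
            ]
            # Block until every shard has finished; each entry is that shard's list of vLLM outputs.
            results = ray.get(futures)
            ray.shutdown()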