
Commit 86f6225

Fix VLLM data-parallel (#541)

Authored by Hynek Kydlicek (hynky1999)

* make bleurt lazy
* make tokenizer lazy too
* fix ray
* fix tensor_parallel > 1
* remove debug statements
* bump vllm

Co-authored-by: Hynek Kydlicek <[email protected]>

1 parent 3c9b0c9 · commit 86f6225

File tree

2 files changed: +6 −3 lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -92,7 +92,7 @@ nanotron = [
     "tensorboardX"
 ]
 tensorboardX = ["tensorboardX"]
-vllm = ["vllm", "ray", "more_itertools"]
+vllm = ["vllm>=0.7.0", "ray", "more_itertools"]
 quality = ["ruff==v0.2.2","pre-commit"]
 tests = ["pytest==7.4.0"]
 dev = ["lighteval[accelerate,quality,tests,multilingual,math]"]

src/lighteval/models/vllm/vllm_model.py

Lines changed: 5 additions & 2 deletions

@@ -111,6 +111,7 @@ def __init__(
         self._config = config
         self.use_chat_template = config.use_chat_template
         self.data_parallel_size = int(config.data_parallel_size)
+        self.tensor_parallel_size = int(config.tensor_parallel_size)

         self._add_special_tokens = config.add_special_tokens if config.add_special_tokens is not None else False
         self._tokenizer = self._create_auto_tokenizer(config, env_config)
@@ -184,7 +185,7 @@ def _create_auto_model(self, config: VLLMModelConfig, env_config: EnvConfig) ->
             "seed": 1234,
         }
         if int(config.data_parallel_size) > 1:
-            self.model_args["worker_use_ray"] = True
+            self.model_args["distributed_executor_backend"] = "ray"
             self._batch_size = "auto"
             return None

@@ -331,7 +332,9 @@ def _generate(
             # see https://github.com/vllm-project/vllm/issues/973
             # note: this has changed on 0.3.3, and it only works now if num_gpus are set.
             # but then tensor_parallel breaks
-            @ray.remote
+            # Hynek: With the newest vllm, it actually breaks when tensor_parallel_size == 1 and num_gpus not set,
+            # as VLLM complains about no GPUs available.
+            @ray.remote(num_gpus=1 if self.tensor_parallel_size == 1 else None)
             def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 llm = LLM(**model_args)
                 return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
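For context on how these pieces fit together, here is a minimal, self-contained sketch of the data-parallel dispatch pattern the decorated function above participates in. The model name, toy token ids, and shard-splitting logic are illustrative assumptions, not code from this commit:

import ray
from vllm import LLM, SamplingParams

data_parallel_size = 2    # assumed example values
tensor_parallel_size = 1

# Reserve one GPU per Ray task only when tensor_parallel_size == 1;
# otherwise leave num_gpus unset so vLLM's "ray" executor claims GPUs itself.
@ray.remote(num_gpus=1 if tensor_parallel_size == 1 else None)
def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
    llm = LLM(**model_args)
    return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)

model_args = {
    "model": "HuggingFaceTB/SmolLM-135M",  # hypothetical checkpoint for illustration
    "tensor_parallel_size": tensor_parallel_size,
}
if data_parallel_size > 1:
    model_args["distributed_executor_backend"] = "ray"

sampling_params = SamplingParams(max_tokens=32)
requests = [[1, 2, 3], [4, 5, 6]]  # toy prompt token ids
shards = [requests[i::data_parallel_size] for i in range(data_parallel_size)]
results = ray.get(
    [run_inference_one_model.remote(model_args, sampling_params, s) for s in shards]
)

Note that ray.init() runs implicitly on the first .remote() call, and with tensor_parallel_size > 1 the explicit num_gpus reservation is deliberately skipped, which is exactly the interaction the comments in the hunk above describe.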
