From 7802270c7dbc46e2c566968c458148847f3590b1 Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Wed, 22 Oct 2025 12:49:56 -0400
Subject: [PATCH 1/3] Add support for vllm 0.11.0

---
 pyproject.toml                          | 2 +-
 src/lighteval/models/vllm/vllm_model.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 45b88d1f2..2cab58f73 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -98,7 +98,7 @@ nanotron = [
   "tensorboardX"
 ]
 tensorboardX = ["tensorboardX"]
-vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"]
+vllm = ["vllm>=0.10.0,<0.11.0", "ray", "more_itertools"]
 sglang = ["sglang"]
 quality = ["ruff>=v0.11.0","pre-commit"]
 tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 969caf8fa..75324002d 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -40,7 +40,6 @@
 from lighteval.utils.cache_management import SampleCache, cached
 from lighteval.utils.imports import is_package_available, requires
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -52,6 +51,7 @@
         destroy_distributed_environment,
         destroy_model_parallel,
     )
+    from vllm.inputs import token_inputs
     from vllm.transformers_utils.tokenizer import get_tokenizer
     from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
 
@@ -437,6 +437,7 @@ def _generate(
             @ray.remote(num_gpus=self.tensor_parallel_size)
             def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 llm = LLM(**model_args)
+                requests = [token_inputs(prompt_token_ids=request) for request in requests]
                 return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
 
             # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
@@ -454,6 +455,7 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r
                 if x is not None
             ]
         else:
+            inputs = [token_inputs(prompt_token_ids=input) for input in inputs]
             outputs = self.model.generate(
                 prompt_token_ids=inputs,
                 sampling_params=sampling_params,

From 46a7b2b80084b4d9632d4b4459bd11ab8d193c8f Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Wed, 22 Oct 2025 12:50:09 -0400
Subject: [PATCH 2/3] Update vllm version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2cab58f73..7835d613f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -98,7 +98,7 @@ nanotron = [
   "tensorboardX"
 ]
 tensorboardX = ["tensorboardX"]
-vllm = ["vllm>=0.10.0,<0.11.0", "ray", "more_itertools"]
+vllm = ["vllm>=0.10.0,<0.12.0", "ray", "more_itertools"]
 sglang = ["sglang"]
 quality = ["ruff>=v0.11.0","pre-commit"]
 tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]

From 757ebaa929390eebd0a29de59489bef5922e13dd Mon Sep 17 00:00:00 2001
From: Alexandre Marques
Date: Wed, 22 Oct 2025 13:35:38 -0400
Subject: [PATCH 3/3] Refactor parameter naming in VLLMModel's generate method
 for consistency

---
 src/lighteval/models/vllm/vllm_model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 75324002d..005847f4f 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -438,7 +438,7 @@ def _generate(
             def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 llm = LLM(**model_args)
                 requests = [token_inputs(prompt_token_ids=request) for request in requests]
-                return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
+                return llm.generate(prompts=requests, sampling_params=sampling_params)
 
             # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
             # interleaved important to balance context lengths across workers
@@ -457,7 +457,7 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r
         else:
             inputs = [token_inputs(prompt_token_ids=input) for input in inputs]
             outputs = self.model.generate(
-                prompt_token_ids=inputs,
+                prompts=inputs,
                 sampling_params=sampling_params,
                 use_tqdm=True,
             )
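For context on the API change driving this series: newer vLLM releases drop the legacy `prompt_token_ids=` keyword from `LLM.generate`, so pre-tokenized prompts must be wrapped in prompt objects and passed through `prompts=`, which is exactly what these patches do with `vllm.inputs.token_inputs`. The sketch below shows the resulting calling convention end to end; the model name, token IDs, and sampling settings are placeholders chosen for illustration, not values taken from lighteval.

```python
# Minimal sketch of the calling convention this patch series moves to.
# Assumes a vLLM version covered by the new constraint; the model name
# and token IDs are illustrative placeholders.
from vllm import LLM, SamplingParams
from vllm.inputs import token_inputs

llm = LLM(model="facebook/opt-125m")  # small model, for illustration only
sampling_params = SamplingParams(temperature=0.0, max_tokens=16)

# Pre-tokenized requests: plain lists of token IDs, as lighteval produces them.
token_id_lists = [[2, 100, 200, 300], [2, 400, 500]]

# Wrap each list with token_inputs() instead of passing raw lists through
# the removed prompt_token_ids= keyword.
prompts = [token_inputs(prompt_token_ids=ids) for ids in token_id_lists]

outputs = llm.generate(prompts=prompts, sampling_params=sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```

Wrapping the token lists with `token_inputs()` in both branches keeps a single calling convention for the Ray data-parallel workers and the single-process path.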