diff --git a/pyproject.toml b/pyproject.toml
index 45b88d1f2..7835d613f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -98,7 +98,7 @@ nanotron = [
     "tensorboardX"
 ]
 tensorboardX = ["tensorboardX"]
-vllm = ["vllm>=0.10.0,<0.10.2", "ray", "more_itertools"]
+vllm = ["vllm>=0.10.0,<0.12.0", "ray", "more_itertools"]
 sglang = ["sglang"]
 quality = ["ruff>=v0.11.0","pre-commit"]
 tests = ["pytest>=7.4.0","deepdiff","pip>=25.2"]
diff --git a/src/lighteval/models/vllm/vllm_model.py b/src/lighteval/models/vllm/vllm_model.py
index 969caf8fa..005847f4f 100644
--- a/src/lighteval/models/vllm/vllm_model.py
+++ b/src/lighteval/models/vllm/vllm_model.py
@@ -40,7 +40,6 @@
 from lighteval.utils.cache_management import SampleCache, cached
 from lighteval.utils.imports import is_package_available, requires
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -52,6 +51,7 @@
         destroy_distributed_environment,
         destroy_model_parallel,
     )
+    from vllm.inputs import token_inputs
     from vllm.transformers_utils.tokenizer import get_tokenizer
     from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
 
@@ -437,7 +437,8 @@ def _generate(
             @ray.remote(num_gpus=self.tensor_parallel_size)
             def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, requests):
                 llm = LLM(**model_args)
-                return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
+                requests = [token_inputs(prompt_token_ids=request) for request in requests]
+                return llm.generate(prompts=requests, sampling_params=sampling_params)
 
             # dispatch requests to all self.data_parallel_size workers, in interleaved fashion
             # interleaved important to balance context lengths across workers
@@ -454,8 +455,9 @@ def run_inference_one_model(model_args: dict, sampling_params: SamplingParams, r
                 if x is not None
             ]
         else:
+            inputs = [token_inputs(prompt_token_ids=input) for input in inputs]
             outputs = self.model.generate(
-                prompt_token_ids=inputs,
+                prompts=inputs,
                 sampling_params=sampling_params,
                 use_tqdm=True,
             )
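
For reviewers, a minimal standalone sketch of the prompt API this diff migrates to: instead of passing raw token ids through the legacy `prompt_token_ids=` keyword of `LLM.generate`, pre-tokenized requests are wrapped with `vllm.inputs.token_inputs` and passed via `prompts=`, which is the call pattern the newer vLLM releases allowed by the widened pin expect. This sketch is not part of the PR; the model name and token ids below are illustrative placeholders.

```python
# Minimal sketch (not part of the PR): old vs. new way to feed pre-tokenized
# prompts to vLLM's LLM.generate. Model name and token ids are placeholders.
from vllm import LLM, SamplingParams
from vllm.inputs import token_inputs

llm = LLM(model="HuggingFaceTB/SmolLM2-135M-Instruct")  # placeholder model
sampling_params = SamplingParams(max_tokens=32)

token_id_batches = [[1, 2, 3], [4, 5, 6]]  # placeholder pre-tokenized prompts

# Old call, removed in the diff above:
#   llm.generate(prompt_token_ids=token_id_batches, sampling_params=sampling_params)

# New call: wrap the token id lists and pass them through `prompts=`.
prompts = [token_inputs(prompt_token_ids=ids) for ids in token_id_batches]
outputs = llm.generate(prompts=prompts, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
```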