diff --git a/pyproject.toml b/pyproject.toml
index 117671aee..05577220b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,7 +82,7 @@ vision = [
     "selenium",
 ]
 vllm = [
-    "vllm",
+    "vllm>=0.10.2",
     "torch"
 ]
 all = [
diff --git a/src/smolagents/models.py b/src/smolagents/models.py
index df9ebd2de..6da910954 100644
--- a/src/smolagents/models.py
+++ b/src/smolagents/models.py
@@ -630,6 +630,7 @@ def generate(
         **kwargs,
     ) -> ChatMessage:
         from vllm import SamplingParams  # type: ignore
+        from vllm.sampling_params import StructuredOutputsParams  # type: ignore
 
         completion_kwargs = self._prepare_completion_kwargs(
             messages=messages,
@@ -639,7 +640,9 @@ def generate(
             **kwargs,
         )
         # Override the OpenAI schema for VLLM compatibility
-        guided_options_request = {"guided_json": response_format["json_schema"]["schema"]} if response_format else None
+        structured_outputs = (
+            StructuredOutputsParams(json=response_format["json_schema"]["schema"]) if response_format else None
+        )
 
         messages = completion_kwargs.pop("messages")
         prepared_stop_sequences = completion_kwargs.pop("stop", [])
@@ -658,12 +661,12 @@ def generate(
             temperature=kwargs.get("temperature", 0.0),
             max_tokens=kwargs.get("max_tokens", 2048),
             stop=prepared_stop_sequences,
+            structured_outputs=structured_outputs,
         )
 
         out = self.model.generate(
             prompt,
             sampling_params=sampling_params,
-            guided_options_request=guided_options_request,
             **completion_kwargs,
         )
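
For context, here is a minimal standalone sketch of the vLLM structured-outputs API this patch migrates to, assuming vllm>=0.10.2 is installed. The model name and JSON schema are illustrative, not taken from the patch:

```python
# Sketch of vLLM structured outputs (assumes vllm>=0.10.2).
# Model name and schema below are placeholders for illustration.
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams

# JSON schema the generated text must conform to (example only).
schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")

# The schema now rides on SamplingParams via structured_outputs,
# instead of being passed as guided_options_request to LLM.generate().
sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=256,
    structured_outputs=StructuredOutputsParams(json=schema),
)

out = llm.generate("Reply in JSON: what is 2 + 2?", sampling_params=sampling_params)
print(out[0].outputs[0].text)
```

This mirrors the change above: `guided_options_request` is no longer accepted by `LLM.generate()`, so the schema constraint moves onto `SamplingParams` itself.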