
Commit b3f5032

Implement tools and outputs for the VLLM model
1 parent 3e7f9a6 commit b3f5032

File tree

3 files changed (+96, -48 lines)


outlines/models/vllm.py

Lines changed: 58 additions & 24 deletions
@@ -1,11 +1,21 @@
 """Integration with a vLLM server."""
 
 import json
-from typing import TYPE_CHECKING, Any, AsyncIterator, Iterator, Optional, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncIterator,
+    Iterator,
+    List,
+    Optional,
+    Union,
+)
 
 from outlines.inputs import Chat
 from outlines.models.base import AsyncModel, Model, ModelTypeAdapter
 from outlines.models.openai import OpenAITypeAdapter
+from outlines.outputs import Output, StreamingOutput
+from outlines.tools import ToolDef
 from outlines.types.dsl import CFG, JsonSchema, python_types_to_terms, to_regex
 
 if TYPE_CHECKING:
@@ -36,7 +46,7 @@ def format_input(self, model_input: Union[Chat, str, list]) -> list:
         """
         return OpenAITypeAdapter().format_input(model_input)
 
-    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
+    def format_output_type(self, output_type: Optional[Any]) -> dict:
         """Generate the structured output argument to pass to the client.
 
         Parameters
@@ -64,6 +74,13 @@ def format_output_type(self, output_type: Optional[Any] = None) -> dict:
         else:
             return {"guided_regex": to_regex(term)}
 
+    def format_tools(self, tools):
+        """Not available for VLLM."""
+        if tools:
+            raise NotImplementedError(
+                "Tools are not available for VLLM."
+            )
+
 
 class VLLM(Model):
     """Thin wrapper around the `openai.OpenAI` client used to communicate with
@@ -93,9 +110,10 @@ def __init__(
     def generate(
         self,
         model_input: Union[Chat, str, list],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> Union[str, list[str]]:
+    ) -> Union[Output, list[Output]]:
         """Generate text using vLLM.
 
         Parameters
@@ -106,15 +124,18 @@ def generate(
             The desired format of the response generated by the model. All
             output types available in Outlines are supported provided your
             server uses a structured generation backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.
 
         Returns
         -------
-        Union[str, list[str]]
+        Union[Output, list[Output]]
             The text generated by the model.
 
         """
+        self.type_adapter.format_tools(tools)
         client_args = self._build_client_args(
             model_input,
             output_type,
@@ -132,24 +153,26 @@ def generate(
         )
 
         if len(messages) == 1:
-            return messages[0].content
+            return Output(content=messages[0].content)
         else:
-            return [message.content for message in messages]
+            return [Output(content=message.content) for message in messages]
 
     def generate_batch(
         self,
         model_input,
-        output_type = None,
+        output_type,
+        tools,
         **inference_kwargs,
     ):
         raise NotImplementedError("VLLM does not support batch inference.")
 
     def generate_stream(
         self,
         model_input: Union[Chat, str, list],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> Iterator[str]:
+    ) -> Iterator[StreamingOutput]:
         """Stream text using vLLM.
 
         Parameters
@@ -160,15 +183,18 @@ def generate_stream(
             The desired format of the response generated by the model. All
             output types available in Outlines are supported provided your
             server uses a structured generation backend that supports them.
+        tools
+            The tools to use for the generation.
        inference_kwargs
             Additional keyword arguments to pass to the client.
 
         Returns
         -------
-        Iterator[str]
+        Iterator[StreamingOutput]
             An iterator that yields the text generated by the model.
 
         """
+        self.type_adapter.format_tools(tools)
         client_args = self._build_client_args(
             model_input, output_type, **inference_kwargs,
         )
@@ -179,12 +205,12 @@ def generate_stream(
 
         for chunk in stream: # pragma: no cover
             if chunk.choices and chunk.choices[0].delta.content is not None:
-                yield chunk.choices[0].delta.content
+                yield StreamingOutput(content=chunk.choices[0].delta.content)
 
     def _build_client_args(
         self,
         model_input: Union[Chat, str, list],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
         **inference_kwargs: Any,
     ) -> dict:
         """Build the arguments to pass to the OpenAI client."""
@@ -234,9 +260,10 @@ def __init__(
     async def generate(
         self,
         model_input: Union[Chat, str, list],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> Union[str, list[str]]:
+    ) -> Union[Output, list[Output]]:
         """Generate text using vLLM.
 
         Parameters
@@ -247,12 +274,14 @@ async def generate(
             The desired format of the response generated by the model. All
             output types available in Outlines are supported provided your
             server uses a structured generation backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.
 
         Returns
         -------
-        Union[str, list[str]]
+        Union[Output, list[Output]]
             The text generated by the model.
 
         """
@@ -271,24 +300,26 @@ async def generate(
         )
 
         if len(messages) == 1:
-            return messages[0].content
+            return Output(content=messages[0].content)
         else:
-            return [message.content for message in messages]
+            return [Output(content=message.content) for message in messages]
 
     async def generate_batch(
         self,
         model_input,
-        output_type = None,
+        output_type,
+        tools,
         **inference_kwargs,
     ):
         raise NotImplementedError("VLLM does not support batch inference.")
 
     async def generate_stream( # type: ignore
         self,
         model_input: Union[Chat, str, list],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> AsyncIterator[str]:
+    ) -> AsyncIterator[StreamingOutput]:
         """Stream text using vLLM.
 
         Parameters
@@ -299,13 +330,16 @@ async def generate_stream( # type: ignore
             The desired format of the response generated by the model. All
             output types available in Outlines are supported provided your
             server uses a structured generation backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.
 
         Returns
         -------
-        AsyncIterator[str]
+        AsyncIterator[StreamingOutput]
             An async iterator that yields the text generated by the model.
+
         """
         client_args = self._build_client_args(
             model_input, output_type, **inference_kwargs,
@@ -318,12 +352,12 @@ async def generate_stream( # type: ignore
 
         async for chunk in stream: # pragma: no cover
             if chunk.choices and chunk.choices[0].delta.content is not None:
-                yield chunk.choices[0].delta.content
+                yield StreamingOutput(content=chunk.choices[0].delta.content)
 
     def _build_client_args(
         self,
         model_input: Union[Chat, str, list],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
         **inference_kwargs: Any,
     ) -> dict:
         """Build the arguments to pass to the OpenAI client."""

tests/models/test_vllm.py

Lines changed: 25 additions & 24 deletions
@@ -11,6 +11,7 @@
 
 from outlines.inputs import Chat, Image
 from outlines.models.vllm import VLLM, AsyncVLLM, from_vllm
+from outlines.outputs import Output, StreamingOutput
 from outlines.types.dsl import CFG, Regex, JsonSchema
 from tests.test_utils.mock_openai_client import MockOpenAIClient, MockAsyncOpenAIClient
 
@@ -225,7 +226,7 @@ def test_vllm_init():
 
 def test_vllm_sync_simple_call(sync_model):
     result = sync_model("Respond with a single word.",)
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 def test_vllm_sync_streaming(sync_model_no_model_name):
@@ -234,7 +235,7 @@ def test_vllm_sync_streaming(sync_model_no_model_name):
         model=vllm_model_name,
     )
     assert isinstance(result, Generator)
-    assert isinstance(next(result), str)
+    assert isinstance(next(result), StreamingOutput)
 
 
 def test_vllm_sync_batch(sync_model):
@@ -246,7 +247,7 @@ def test_vllm_sync_batch(sync_model):
 
 def test_vllm_sync_vision(sync_model):
     result = sync_model(["hello", image_input], max_tokens=10)
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 def test_vllm_sync_vision_chat(sync_model):
@@ -261,40 +262,40 @@ def test_vllm_sync_vision_chat(sync_model):
         ]),
         max_tokens=10,
     )
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 def test_vllm_sync_multiple_samples(sync_model):
     result = sync_model("Respond with a single word.", n=2)
     assert isinstance(result, list)
     assert len(result) == 2
-    assert isinstance(result[0], str)
-    assert isinstance(result[1], str)
+    assert isinstance(result[0], Output)
+    assert isinstance(result[1], Output)
 
 
 def test_vllm_sync_json(sync_model):
     json_string = '{"type": "object", "properties": {"bar": {"type": "string"}}}'
     result = sync_model("foo?", JsonSchema(json_string), max_tokens=10)
-    assert isinstance(result, str)
-    assert "bar" in result
+    assert isinstance(result, Output)
+    assert "bar" in result.content
 
 
 def test_vllm_sync_regex(sync_model):
     result = sync_model("foo?", Regex(r"[0-9]{3}"), max_tokens=10)
-    assert isinstance(result, str)
-    assert re.match(r"[0-9]{3}", result)
+    assert isinstance(result, Output)
+    assert re.match(r"[0-9]{3}", result.content)
 
 
 def test_vllm_sync_cfg(sync_model):
     result = sync_model("foo?", CFG(YES_NO_GRAMMAR), max_tokens=10)
-    assert isinstance(result, str)
-    assert result in ["yes", "no"]
+    assert isinstance(result, Output)
+    assert result.content in ["yes", "no"]
 
 
 @pytest.mark.asyncio
 async def test_vllm_async_simple_call(async_model):
     result = await async_model("Respond with a single word.",)
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 @pytest.mark.asyncio
@@ -305,7 +306,7 @@ async def test_vllm_async_streaming(async_model_no_model_name):
     )
     assert isinstance(result, AsyncGenerator)
     async for chunk in result:
-        assert isinstance(chunk, str)
+        assert isinstance(chunk, StreamingOutput)
         break # Just check the first chunk
 
 
@@ -320,7 +321,7 @@ async def test_vllm_async_batch(async_model):
 @pytest.mark.asyncio
 async def test_vllm_async_vision(async_model):
     result = await async_model(["hello", image_input], max_tokens=10)
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 @pytest.mark.asyncio
@@ -336,35 +337,35 @@ async def test_vllm_async_vision_chat(async_model):
         ]),
         max_tokens=10,
     )
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 @pytest.mark.asyncio
 async def test_vllm_async_multiple_samples(async_model):
     result = await async_model("Respond with a single word.", n=2)
     assert isinstance(result, list)
     assert len(result) == 2
-    assert isinstance(result[0], str)
-    assert isinstance(result[1], str)
+    assert isinstance(result[0], Output)
+    assert isinstance(result[1], Output)
 
 
 @pytest.mark.asyncio
 async def test_vllm_async_json(async_model):
     json_string = '{"type": "object", "properties": {"bar": {"type": "string"}}}'
     result = await async_model("foo?", JsonSchema(json_string), max_tokens=10)
-    assert isinstance(result, str)
-    assert "bar" in result
+    assert isinstance(result, Output)
+    assert "bar" in result.content
 
 
 @pytest.mark.asyncio
 async def test_vllm_async_regex(async_model):
     result = await async_model("foo?", Regex(r"[0-9]{3}"), max_tokens=10)
-    assert isinstance(result, str)
-    assert re.match(r"[0-9]{3}", result)
+    assert isinstance(result, Output)
+    assert re.match(r"[0-9]{3}", result.content)
 
 
 @pytest.mark.asyncio
 async def test_vllm_async_cfg(async_model):
     result = await async_model("foo?", CFG(YES_NO_GRAMMAR), max_tokens=10)
-    assert isinstance(result, str)
-    assert result in ["yes", "no"]
+    assert isinstance(result, Output)
+    assert result.content in ["yes", "no"]
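The asynchronous client follows the same shape: streaming yields StreamingOutput chunks whose text is on .content. Below is a rough sketch, not part of this commit, that drives the generate_stream method modified above directly, passing output_type=None and tools=None positionally as its new signature requires. The server URL and model name are placeholders, and it assumes from_vllm returns an AsyncVLLM when given an openai.AsyncOpenAI client, as the mock-client tests suggest.

import asyncio

import openai

from outlines.models.vllm import from_vllm
from outlines.outputs import StreamingOutput


async def main():
    # Hypothetical setup mirroring the synchronous sketch above.
    model = from_vllm(
        openai.AsyncOpenAI(base_url="http://localhost:8000/v1"),
        "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model name
    )
    # output_type=None (plain text), tools=None (tool calling is unsupported).
    stream = model.generate_stream("Tell me a joke.", None, None, max_tokens=16)
    async for chunk in stream:
        assert isinstance(chunk, StreamingOutput)
        print(chunk.content, end="", flush=True)


asyncio.run(main())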
