 from outlines.inputs import Chat
 from outlines.models.base import Model, ModelTypeAdapter
 from outlines.models.openai import OpenAITypeAdapter
+from outlines.outputs import Output
+from outlines.tools import ToolDef
 from outlines.types.dsl import CFG, JsonSchema, python_types_to_terms, to_regex
 
 if TYPE_CHECKING:
@@ -56,7 +58,7 @@ def format_input_chat(self, model_input: Chat) -> list:
             )
         return OpenAITypeAdapter().format_input(model_input)
 
-    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
+    def format_output_type(self, output_type: Optional[Any]) -> dict:
         """Generate the structured output argument to pass to the model.
 
         For vLLM, the structured output definition is set in the
@@ -90,6 +92,13 @@ def format_output_type(self, output_type: Optional[Any] = None) -> dict:
         else:
             return {"regex": to_regex(term)}
 
+    def format_tools(self, tools):
+        """Not available for VLLM offline."""
+        if tools:
+            raise NotImplementedError(
+                "Tools are not available for VLLM offline."
+            )
+
 
 class VLLMOffline(Model):
     """Thin wrapper around a `vllm.LLM` model.
@@ -114,7 +123,7 @@ def __init__(self, model: "LLM"):
     def _build_generation_args(
         self,
         inference_kwargs: dict,
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
     ) -> "SamplingParams":
         """Create the `SamplingParams` object to pass to the `generate` method
         of the `vllm.LLM` model."""
@@ -134,9 +143,10 @@ def _build_generation_args(
     def generate(
         self,
         model_input: Chat | str,
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> Union[str, List[str]]:
+    ) -> Union[Output, List[Output]]:
         """Generate text using vLLM offline.
 
         Parameters
@@ -146,16 +156,19 @@ def generate(
         output_type
             The logits processor the model will use to constrain the format of
             the generated text.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the `generate` method
             in the `vllm.LLM` model.
 
         Returns
         -------
-        Union[str, List[str]]
+        Union[Output, List[Output]]
             The text generated by the model.
 
         """
+        self.type_adapter.format_tools(tools)
         sampling_params = self._build_generation_args(
             inference_kwargs,
             output_type,
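
Since `output_type` and `tools` lose their defaults in this diff, callers must now pass both explicitly, even when unused. A hedged usage sketch (the model id and the `VLLMOffline(LLM(...))` wiring are illustrative, not mandated by the diff):

```python
from vllm import LLM  # requires vLLM installed

# Illustrative wiring; outlines may expose a different public constructor.
model = VLLMOffline(LLM("microsoft/Phi-3-mini-4k-instruct"))

# Both new positional arguments are passed explicitly; None means
# "no structured output" and "no tools".
output = model.generate("Write a haiku about autumn.", None, None)
print(output.content)  # an Output object now, not a bare str
```
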
@@ -168,24 +181,25 @@ def generate(
                 **inference_kwargs,
             )
         else:
-            results = self.model.generate(
+            results = self.model(
                 prompts=self.type_adapter.format_input(model_input),
                 sampling_params=sampling_params,
                 **inference_kwargs,
             )
         results = [completion.text for completion in results[0].outputs]
 
         if len(results) == 1:
-            return results[0]
+            return Output(content=results[0])
         else:
-            return results
+            return [Output(content=result) for result in results]
 
     def generate_batch(
         self,
         model_input: List[Chat | str],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> Union[List[str], List[List[str]]]:
+    ) -> Union[List[Output], List[List[Output]]]:
         """Generate a batch of completions using vLLM offline.
 
         Parameters
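
The unwrap above makes the return type of `generate` depend on how many samples vLLM produced for the prompt. Continuing the sketch, and assuming a `sampling_params` keyword is still popped out of `inference_kwargs` by `_build_generation_args` as in the pre-existing code:

```python
from vllm import SamplingParams
from outlines.outputs import Output

# One completion -> a single Output
single = model.generate("Name a color.", None, None)
assert isinstance(single, Output)

# n > 1 completions -> a list of Output, one per sample (assumed
# forwarding of `sampling_params` through **inference_kwargs)
several = model.generate(
    "Name a color.", None, None,
    sampling_params=SamplingParams(n=3),
)
assert isinstance(several, list) and len(several) == 3
```
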
@@ -196,16 +210,19 @@ def generate_batch(
         output_type
             The logits processor the model will use to constrain the format of
             the generated text.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the `generate` method
             in the `vllm.LLM` model.
 
         Returns
         -------
-        Union[List[str], List[List[str]]]
+        Union[List[Output], List[List[Output]]]
             The text generated by the model.
 
         """
+        self.type_adapter.format_tools(tools)
         sampling_params = self._build_generation_args(
             inference_kwargs,
             output_type,
@@ -216,14 +233,20 @@ def generate_batch(
216233 "Batch generation is not available for the `Chat` input type."
217234 )
218235
219- results = self .model . generate (
236+ results = self .model (
220237 prompts = [self .type_adapter .format_input (item ) for item in model_input ],
221238 sampling_params = sampling_params ,
222239 ** inference_kwargs ,
223240 )
224- return [[sample .text for sample in batch .outputs ] for batch in results ]
225241
226- def generate_stream (self , model_input , output_type , ** inference_kwargs ):
242+ return [ # type: ignore
243+ [Output (content = sample .text ) for sample in batch .outputs ]
244+ if len (batch .outputs ) > 1
245+ else Output (content = batch .outputs [0 ].text )
246+ for batch in results
247+ ]
248+
249+ def generate_stream (self , model_input , output_type , tools , ** inference_kwargs ):
227250 """Not available for `vllm.LLM`.
228251
229252 TODO: Implement the streaming functionality ourselves.
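
`generate_batch` now applies the same per-prompt unwrap as `generate`, which is why each element is `Output` rather than `List[Output]` when only one sample is drawn, and why the comprehension needs a `# type: ignore`. Continuing the sketch:

```python
# Default sampling draws one completion per prompt, so each element
# collapses to a bare Output rather than a one-element list.
results = model.generate_batch(["Say hi.", "Say bye."], None, None)
for result in results:
    print(result.content)

# Streaming stays unsupported on the offline backend; per the docstring it
# is "Not available", so calling generate_stream presumably raises
# (assumption: NotImplementedError, matching the format_tools pattern).
```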