Commit 9955e1e

Implement tools and outputs for the TGI model

1 parent 39a474d · commit 9955e1e

3 files changed: +79 −35 lines changed

outlines/models/tgi.py

Lines changed: 52 additions & 22 deletions
@@ -7,11 +7,14 @@
     Any,
     AsyncIterator,
     Iterator,
+    List,
     Optional,
     Union,
 )

-from outlines.models.base import AsyncModel,Model, ModelTypeAdapter
+from outlines.models.base import AsyncModel, Model, ModelTypeAdapter
+from outlines.outputs import Output, StreamingOutput
+from outlines.tools import ToolDef
 from outlines.types.dsl import python_types_to_terms, to_regex, JsonSchema, CFG

 if TYPE_CHECKING:
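
The `Output` and `StreamingOutput` containers imported from `outlines.outputs` are not defined in this diff. A minimal sketch of what they could look like, assuming plain dataclasses with a single `content` field (the field name is taken from the `Output(content=...)` calls below; everything else is an assumption):

# Hypothetical sketch; the real definitions live in outlines/outputs.py
# and are not shown in this commit.
from dataclasses import dataclass


@dataclass
class Output:
    """A complete model response; the generated text is in `content`."""
    content: str


@dataclass
class StreamingOutput:
    """One chunk of a streamed response."""
    content: str
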
@@ -47,7 +50,7 @@ def format_input(self, model_input):
     def format_str_input(self, model_input: str) -> str:
         return model_input

-    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
+    def format_output_type(self, output_type: Optional[Any]) -> dict:
         """Generate the structured output argument to pass to the client.

         Argument
@@ -84,6 +87,13 @@ def format_output_type(self, output_type: Optional[Any] = None) -> dict:
             }
         }

+    def format_tools(self, tools):
+        """Not available for TGI."""
+        if tools:
+            raise NotImplementedError(
+                "Tools are not available for TGI."
+            )
+

 class TGI(Model):
     """Thin wrapper around a `huggingface_hub.InferenceClient` client used to
@@ -109,9 +119,10 @@ def __init__(self, client):
     def generate(
         self,
         model_input: str,
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> str:
+    ) -> Output:
         """Generate text using TGI.

         Parameters
@@ -122,37 +133,44 @@ def generate(
             The desired format of the response generated by the model. All
             output types except `CFG` are supported provided your server uses
             a backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.

         Returns
         -------
-        str
+        Output
             The text generated by the model.

         """
+        self.type_adapter.format_tools(tools)
         client_args = self._build_client_args(
             model_input,
             output_type,
             **inference_kwargs,
         )

-        return self.client.text_generation(**client_args)
+        response = self.client.text_generation(**client_args)
+
+        return Output(content=response)

     def generate_batch(
         self,
         model_input,
-        output_type = None,
+        output_type,
+        tools,
         **inference_kwargs,
     ):
         raise NotImplementedError("TGI does not support batch inference.")

     def generate_stream(
         self,
         model_input: str,
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> Iterator[str]:
+    ) -> Iterator[StreamingOutput]:
         """Stream text using TGI.

@@ -163,15 +181,18 @@ def generate_stream(
             The desired format of the response generated by the model. All
             output types except `CFG` are supported provided your server uses
             a backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.

         Returns
         -------
-        Iterator[str]
+        Iterator[StreamingOutput]
             An iterator that yields the text generated by the model.

         """
+        self.type_adapter.format_tools(tools)
         client_args = self._build_client_args(
             model_input, output_type, **inference_kwargs,
         )
@@ -181,12 +202,12 @@ def generate_stream(
         )

         for chunk in stream:  # pragma: no cover
-            yield chunk
+            yield StreamingOutput(content=chunk)

     def _build_client_args(
         self,
         model_input: str,
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
         **inference_kwargs: Any,
     ) -> dict:
         """Build the arguments to pass to the TGI client."""
@@ -226,9 +247,10 @@ def __init__(self, client):
     async def generate(
         self,
         model_input: str,
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> str:
+    ) -> Output:
         """Generate text using TGI.

         Parameters
@@ -239,37 +261,42 @@ async def generate(
             The desired format of the response generated by the model. All
             output types except `CFG` are supported provided your server uses
             a backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.

         Returns
         -------
-        str
+        Output
             The text generated by the model.

         """
+        self.type_adapter.format_tools(tools)
         client_args = self._build_client_args(
             model_input, output_type, **inference_kwargs,
         )

         response = await self.client.text_generation(**client_args)

-        return response
+        return Output(content=response)

     async def generate_batch(
         self,
         model_input,
-        output_type = None,
+        output_type,
+        tools,
         **inference_kwargs,
     ):
         raise NotImplementedError("TGI does not support batch inference.")

     async def generate_stream(  # type: ignore
         self,
         model_input: str,
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> AsyncIterator[str]:
+    ) -> AsyncIterator[StreamingOutput]:
         """Stream text using TGI.

         Parameters
@@ -280,15 +307,18 @@ async def generate_stream(  # type: ignore
             The desired format of the response generated by the model. All
             output types except `CFG` are supported provided your server uses
             a backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.

         Returns
         -------
-        AsyncIterator[str]
+        AsyncIterator[StreamingOutput]
             An async iterator that yields the text generated by the model.

         """
+        self.type_adapter.format_tools(tools)
         client_args = self._build_client_args(
             model_input, output_type, **inference_kwargs,
         )
@@ -298,12 +328,12 @@ async def generate_stream(  # type: ignore
         )

         async for chunk in stream:  # pragma: no cover
-            yield chunk
+            yield StreamingOutput(content=chunk)

     def _build_client_args(
         self,
         model_input: str,
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
         **inference_kwargs: Any,
     ) -> dict:
         """Build the arguments to pass to the TGI client."""

tests/models/test_tgi.py

Lines changed: 14 additions & 13 deletions
@@ -7,6 +7,7 @@
 from huggingface_hub import InferenceClient, AsyncInferenceClient

 from outlines.models.tgi import TGI, AsyncTGI, from_tgi
+from outlines.outputs import Output, StreamingOutput
 from outlines.types.dsl import CFG, Regex, JsonSchema
 from tests.test_utils.mock_tgi_client import MockTGIInferenceClient, MockAsyncTGIInferenceClient

@@ -42,7 +43,7 @@
             'max_new_tokens': 10,
             'stream': True
         },
-        ["foo", "bar"]
+        [Output(content="foo"), Output(content="bar")]
     ),
     (
         {
@@ -108,7 +109,7 @@ def test_tgi_init():

 def test_tgi_sync_simple_call(sync_model):
     result = sync_model("Respond with a single word.", max_new_tokens=10)
-    assert isinstance(result, str)
+    assert isinstance(result, Output)


 def test_tgi_sync_streaming(sync_model):
@@ -117,7 +118,7 @@ def test_tgi_sync_streaming(sync_model):
         max_new_tokens=10,
     )
     assert isinstance(result, Generator)
-    assert isinstance(next(result), str)
+    assert isinstance(next(result), StreamingOutput)


 def test_tgi_sync_batch(sync_model):
@@ -130,14 +131,14 @@
 def test_tgi_sync_json(sync_model):
     json_string = '{"type": "object", "properties": {"bar": {"type": "string"}}, "required": ["bar"]}'
     result = sync_model("foo?", JsonSchema(json_string), max_new_tokens=10)
-    assert isinstance(result, str)
-    assert "bar" in result
+    assert isinstance(result, Output)
+    assert "bar" in result.content


 def test_tgi_sync_regex(sync_model):
     result = sync_model("foo?", Regex(r"[0-9]{3}"), max_new_tokens=10)
-    assert isinstance(result, str)
-    assert re.match(r"[0-9]{3}", result)
+    assert isinstance(result, Output)
+    assert re.match(r"[0-9]{3}", result.content)


 def test_tgi_sync_cfg(sync_model):
@@ -151,15 +152,15 @@
 @pytest.mark.asyncio
 async def test_tgi_async_simple_call(async_model):
     result = await async_model("Respond with a single word.", max_new_tokens=10)
-    assert isinstance(result, str)
+    assert isinstance(result, Output)


 @pytest.mark.asyncio
 async def test_tgi_async_streaming(async_model):
     result = async_model.stream("Respond with a single word.", max_new_tokens=10)
     assert isinstance(result, AsyncGenerator)
     async for chunk in result:
-        assert isinstance(chunk, str)
+        assert isinstance(chunk, StreamingOutput)
         break  # Just check the first chunk

@@ -175,15 +176,15 @@ async def test_tgi_async_batch(async_model):
 async def test_tgi_async_json(async_model):
     json_string = '{"type": "object", "properties": {"bar": {"type": "string"}}, "required": ["bar"]}'
     result = await async_model("foo?", JsonSchema(json_string), max_new_tokens=10)
-    assert isinstance(result, str)
-    assert "bar" in result
+    assert isinstance(result, Output)
+    assert "bar" in result.content


 @pytest.mark.asyncio
 async def test_tgi_async_regex(async_model):
     result = await async_model("foo?", Regex(r"[0-9]{3}"), max_new_tokens=10)
-    assert isinstance(result, str)
-    assert re.match(r"[0-9]{3}", result)
+    assert isinstance(result, Output)
+    assert re.match(r"[0-9]{3}", result.content)


 @pytest.mark.asyncio
Lines changed: 13 additions & 0 deletions
@@ -2,6 +2,7 @@
 import pytest

 from outlines.models.tgi import TGITypeAdapter
+from outlines.tools import ToolDef
 from outlines.types import CFG, JsonSchema

@@ -86,3 +87,15 @@ def test_tgi_type_adapter_output_type_invalid(
         match="TGI does not support CFG-based structured outputs.",
     ):
         type_adapter.format_output_type(cfg_instance)
+
+
+def test_tgi_type_adapter_tools(type_adapter):
+    with pytest.raises(
+        NotImplementedError,
+        match="Tools are not available for TGI.",
+    ):
+        type_adapter.format_tools(
+            [ToolDef(name="test", description="test", parameters={})]
+        )
+
+    type_adapter.format_tools(None)
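
`ToolDef` itself is not defined in this commit; the test only shows it accepting `name`, `description`, and `parameters`. A plausible minimal shape consistent with that call, purely as an assumption:

# Hypothetical sketch of ToolDef inferred from the test above; the real
# definition lives in outlines/tools.py and is not part of this diff.
from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class ToolDef:
    name: str
    description: str
    parameters: Dict[str, Any] = field(default_factory=dict)
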
