
Commit 39a474d

Implement tools and outputs for the SGLang model
1 parent 62acef9


3 files changed, +89 -48 lines


outlines/models/sglang.py

Lines changed: 51 additions & 24 deletions
@@ -3,12 +3,14 @@
 import json
 import warnings
 from typing import (
-    TYPE_CHECKING, Any, AsyncIterator, Iterator, Optional, Union
+    TYPE_CHECKING, Any, AsyncIterator, Iterator, List, Optional, Union
 )
 
 from outlines.inputs import Chat
 from outlines.models.base import AsyncModel, Model, ModelTypeAdapter
 from outlines.models.openai import OpenAITypeAdapter
+from outlines.outputs import Output, StreamingOutput
+from outlines.tools import ToolDef
 from outlines.types.dsl import (
     CFG,
     JsonSchema,
@@ -44,7 +46,7 @@ def format_input(self, model_input: Union[Chat, list, str]) -> list:
         """
         return OpenAITypeAdapter().format_input(model_input)
 
-    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
+    def format_output_type(self, output_type: Optional[Any]) -> dict:
         """Generate the structured output argument to pass to the client.
 
         Parameters
@@ -78,6 +80,13 @@ def format_output_type(self, output_type: Optional[Any] = None) -> dict:
         else:
             return {"extra_body": {"regex": to_regex(term)}}
 
+    def format_tools(self, tools):
+        """Not available for SGLang."""
+        if tools:
+            raise NotImplementedError(
+                "Tools are not available for SGLang."
+            )
+
 
 class SGLang(Model):
     """Thin wrapper around the `openai.OpenAI` client used to communicate with
@@ -106,9 +115,10 @@ def __init__(self, client, model_name: Optional[str] = None):
     def generate(
         self,
         model_input: Union[Chat, list, str],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> Union[str, list[str]]:
+    ) -> Output | list[Output]:
         """Generate text using SGLang.
 
         Parameters
@@ -119,15 +129,18 @@ def generate(
             The desired format of the response generated by the model. All
             output types available in Outlines are supported provided your
             server uses a structured generation backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.
 
         Returns
         -------
-        Union[str, list[str]]
+        Output | list[Output]
             The text generated by the model.
 
         """
+        self.type_adapter.format_tools(tools)
         client_args = self._build_client_args(
             model_input,
             output_type,
@@ -145,14 +158,15 @@ def generate(
         )
 
         if len(messages) == 1:
-            return messages[0].content
+            return Output(content=messages[0].content)
         else:
-            return [message.content for message in messages]
+            return [Output(content=message.content) for message in messages]
 
     def generate_batch(
         self,
         model_input,
-        output_type = None,
+        output_type,
+        tools,
         **inference_kwargs,
     ):
         raise NotImplementedError(
@@ -162,9 +176,10 @@ def generate_batch(
     def generate_stream(
         self,
         model_input: Union[Chat, list, str],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> Iterator[str]:
+    ) -> Iterator[StreamingOutput]:
         """Stream text using SGLang.
 
         Parameters
@@ -175,15 +190,18 @@ def generate_stream(
             The desired format of the response generated by the model. All
             output types available in Outlines are supported provided your
             server uses a structured generation backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.
 
         Returns
         -------
-        Iterator[str]
+        Iterator[StreamingOutput]
             An iterator that yields the text generated by the model.
 
         """
+        self.type_adapter.format_tools(tools)
         client_args = self._build_client_args(
             model_input, output_type, **inference_kwargs,
         )
@@ -194,12 +212,12 @@ def generate_stream(
 
         for chunk in stream:  # pragma: no cover
             if chunk.choices and chunk.choices[0].delta.content is not None:
-                yield chunk.choices[0].delta.content
+                yield StreamingOutput(content=chunk.choices[0].delta.content)
 
     def _build_client_args(
         self,
         model_input: Union[Chat, str, list],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
         **inference_kwargs: Any,
     ) -> dict:
         """Build the arguments to pass to the SGLang client."""
@@ -250,9 +268,10 @@ def __init__(self, client, model_name: Optional[str] = None):
     async def generate(
         self,
         model_input: Union[Chat, str, list],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> Union[str, list[str]]:
+    ) -> Union[Output, list[Output]]:
         """Generate text using `sglang`.
 
         Parameters
@@ -263,15 +282,18 @@ async def generate(
             The desired format of the response generated by the model. All
             output types available in Outlines are supported provided your
             server uses a structured generation backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.
 
         Returns
         -------
-        Union[str, list[str]]
+        Union[Output, list[Output]]
             The text generated by the model.
 
         """
+        self.type_adapter.format_tools(tools)
         client_args = self._build_client_args(
             model_input, output_type, **inference_kwargs,
         )
@@ -287,14 +309,15 @@ async def generate(
         )
 
         if len(messages) == 1:
-            return messages[0].content
+            return Output(content=messages[0].content)
         else:
-            return [message.content for message in messages]
+            return [Output(content=message.content) for message in messages]
 
     async def generate_batch(
         self,
         model_input,
-        output_type = None,
+        output_type,
+        tools,
         **inference_kwargs,
     ):
         raise NotImplementedError(
@@ -304,9 +327,10 @@ async def generate_batch(
     async def generate_stream(  # type: ignore
         self,
         model_input: Union[Chat, str, list],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> AsyncIterator[str]:
+    ) -> AsyncIterator[StreamingOutput]:
         """Return a text generator.
 
         Parameters
@@ -317,15 +341,18 @@ async def generate_stream(  # type: ignore
             The desired format of the response generated by the model. All
             output types available in Outlines are supported provided your
             server uses a structured generation backend that supports them.
+        tools
+            The tools to use for the generation.
         inference_kwargs
             Additional keyword arguments to pass to the client.
 
         Returns
         -------
-        AsyncIterator[str]
+        AsyncIterator[StreamingOutput]
             An async iterator that yields the text generated by the model.
 
         """
+        self.type_adapter.format_tools(tools)
         client_args = self._build_client_args(
             model_input, output_type, **inference_kwargs,
         )
@@ -337,12 +364,12 @@ async def generate_stream(  # type: ignore
 
         async for chunk in stream:  # pragma: no cover
             if chunk.choices and chunk.choices[0].delta.content is not None:
-                yield chunk.choices[0].delta.content
+                yield StreamingOutput(content=chunk.choices[0].delta.content)
 
     def _build_client_args(
         self,
         model_input: Union[Chat, str, list],
-        output_type: Optional[Any] = None,
+        output_type: Optional[Any],
         **inference_kwargs: Any,
     ) -> dict:
         """Build the arguments to pass to the SGLang client."""

tests/models/test_sglang.py

Lines changed: 25 additions & 24 deletions
@@ -15,6 +15,7 @@
 
 from outlines.inputs import Chat, Image
 from outlines.models.sglang import SGLang, AsyncSGLang, from_sglang
+from outlines.outputs import Output, StreamingOutput
 from outlines.types.dsl import CFG, Regex, JsonSchema
 from tests.test_utils.mock_openai_client import MockOpenAIClient, MockAsyncOpenAIClient
 
@@ -231,7 +232,7 @@ def test_sglang_init():
 
 def test_sglang_sync_simple_call(sync_model):
     result = sync_model("Respond with a single word.",)
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 def test_sglang_sync_streaming(sync_model_no_model_name):
@@ -240,7 +241,7 @@ def test_sglang_sync_streaming(sync_model_no_model_name):
         model=sglang_model_name,
     )
     assert isinstance(result, Generator)
-    assert isinstance(next(result), str)
+    assert isinstance(next(result), StreamingOutput)
 
 
 def test_sglang_sync_batch(sync_model):
@@ -252,7 +253,7 @@ def test_sglang_sync_batch(sync_model):
 
 def test_sglang_sync_vision(sync_model):
     result = sync_model(["hello", image_input], max_tokens=10)
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 def test_sglang_sync_vision_chat(sync_model):
@@ -267,15 +268,15 @@ def test_sglang_sync_vision_chat(sync_model):
         ]),
         max_tokens=10,
     )
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 def test_sglang_sync_multiple_samples(sync_model):
     result = sync_model("Respond with a single word.", n=2)
     assert isinstance(result, list)
     assert len(result) == 2
-    assert isinstance(result[0], str)
-    assert isinstance(result[1], str)
+    assert isinstance(result[0], Output)
+    assert isinstance(result[1], Output)
 
 
 def test_sglang_sync_json(sync_model):
@@ -284,14 +285,14 @@ def test_sglang_sync_json(sync_model):
         + ' {"bar": {"type": "string"}}}'
     )
     result = sync_model("foo?", JsonSchema(json_string), max_tokens=10)
-    assert isinstance(result, str)
-    assert "bar" in result
+    assert isinstance(result, Output)
+    assert "bar" in result.content
 
 
 def test_sglang_sync_regex(sync_model):
     result = sync_model("foo?", Regex(r"[0-9]{3}"), max_tokens=10)
-    assert isinstance(result, str)
-    assert re.match(r"[0-9]{3}", result)
+    assert isinstance(result, Output)
+    assert re.match(r"[0-9]{3}", result.content)
 
 
 def test_sglang_sync_cfg(sync_model):
@@ -300,14 +301,14 @@ def test_sglang_sync_cfg(sync_model):
         match="SGLang grammar-based structured outputs expects an EBNF"
     ):
         result = sync_model("foo?", CFG(EBNF_YES_NO_GRAMMAR), max_tokens=10)
-        assert isinstance(result, str)
-        assert result in ["yes", "no"]
+        assert isinstance(result, Output)
+        assert result.content in ["yes", "no"]
 
 
 @pytest.mark.asyncio
 async def test_sglang_async_simple_call(async_model):
     result = await async_model("Respond with a single word.",)
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 @pytest.mark.asyncio
@@ -318,7 +319,7 @@ async def test_sglang_async_streaming(async_model_no_model_name):
     )
     assert isinstance(result, AsyncGenerator)
     async for chunk in result:
-        assert isinstance(chunk, str)
+        assert isinstance(chunk, StreamingOutput)
         break  # Just check the first chunk
 
 
@@ -333,7 +334,7 @@ async def test_sglang_async_batch(async_model):
 @pytest.mark.asyncio
 async def test_sglang_async_vision(async_model):
     result = await async_model(["hello", image_input], max_tokens=10)
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 @pytest.mark.asyncio
@@ -349,16 +350,16 @@ async def test_sglang_async_vision_chat(async_model):
         ]),
         max_tokens=10,
     )
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 @pytest.mark.asyncio
 async def test_sglang_async_multiple_samples(async_model):
     result = await async_model("Respond with a single word.", n=2)
     assert isinstance(result, list)
     assert len(result) == 2
-    assert isinstance(result[0], str)
-    assert isinstance(result[1], str)
+    assert isinstance(result[0], Output)
+    assert isinstance(result[1], Output)
 
 
 @pytest.mark.asyncio
@@ -368,19 +369,19 @@ async def test_sglang_async_json(async_model):
         + ' {"bar": {"type": "string"}}}'
     )
     result = await async_model("foo?", JsonSchema(json_string), max_tokens=10)
-    assert isinstance(result, str)
-    assert "bar" in result
+    assert isinstance(result, Output)
+    assert "bar" in result.content
 
 
 @pytest.mark.asyncio
 async def test_sglang_async_regex(async_model):
     result = await async_model("foo?", Regex(r"[0-9]{3}"), max_tokens=10)
-    assert isinstance(result, str)
-    assert re.match(r"[0-9]{3}", result)
+    assert isinstance(result, Output)
+    assert re.match(r"[0-9]{3}", result.content)
 
 
 @pytest.mark.asyncio
 async def test_sglang_async_cfg(async_model):
     result = await async_model("foo?", CFG(EBNF_YES_NO_GRAMMAR), max_tokens=10)
-    assert isinstance(result, str)
-    assert result in ["yes", "no"]
+    assert isinstance(result, Output)
+    assert result.content in ["yes", "no"]
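
For reference, a small sketch of the assertion pattern the updated tests follow: substring, regex, and JSON checks move from the result itself to its .content attribute. Only the Output(content=...) constructor is taken from the diff above; the values are illustrative.

import json
import re

from outlines.outputs import Output

# The raw generated text now lives on .content, so substring, regex,
# and JSON checks all go through that attribute.
output = Output(content='{"bar": "baz"}')
assert isinstance(output, Output)
assert "bar" in output.content
assert json.loads(output.content)["bar"] == "baz"

digits = Output(content="123")
assert re.match(r"[0-9]{3}", digits.content)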
