Commit 11f0891

Implement tools and outputs for the LlamaCpp model

1 parent 789bda6
3 files changed: +83 -36 lines changed

outlines/models/llamacpp.py

Lines changed: 39 additions & 14 deletions
@@ -17,7 +17,9 @@
 from outlines.inputs import Chat
 from outlines.models.base import Model, ModelTypeAdapter
 from outlines.models.tokenizer import Tokenizer
+from outlines.outputs import Output, StreamingOutput
 from outlines.processors import OutlinesLogitsProcessor
+from outlines.tools import ToolDef
 
 if TYPE_CHECKING:
     from llama_cpp import Llama, LogitsProcessorList
@@ -196,7 +198,7 @@ def format_chat_input(self, model_input: Chat) -> list:
         ]
 
     def format_output_type(
-        self, output_type: Optional[OutlinesLogitsProcessor] = None,
+        self, output_type: Optional[OutlinesLogitsProcessor],
     ) -> "LogitsProcessorList":
         """Generate the logits processor argument to pass to the model.
 
@@ -215,6 +217,13 @@ def format_output_type(
 
         return LogitsProcessorList([output_type])
 
+    def format_tools(self, tools):
+        """Not available for LlamaCpp."""
+        if tools:
+            raise NotImplementedError(
+                "LlamaCpp does not support tools."
+            )
+
 
 class LlamaCpp(Model):
     """Thin wrapper around the `llama_cpp.Llama` model.
@@ -240,9 +249,10 @@ def __init__(self, model: "Llama"):
     def generate(
         self,
         model_input: Union[Chat, str],
-        output_type: Optional[OutlinesLogitsProcessor] = None,
+        output_type: Optional[OutlinesLogitsProcessor],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> str:
+    ) -> Output:
         """Generate text using `llama-cpp-python`.
 
         Parameters
@@ -252,6 +262,8 @@ def generate(
         output_type
             The logits processor the model will use to constrain the format of
             the generated text.
+        tools
+            The tools to use for the generation.
         **inference_kwargs
             Additional keyword arguments to pass to the `Llama.__call__`
             method of the `llama-cpp-python` library.
@@ -262,41 +274,46 @@ def generate(
             The text generated by the model.
 
         """
+        self.type_adapter.format_tools(tools)
         prompt = self.type_adapter.format_input(model_input)
+        logits_processor = self.type_adapter.format_output_type(output_type)
 
         if isinstance(prompt, str):
             completion = self.model(
                 prompt,
-                logits_processor=self.type_adapter.format_output_type(output_type),
+                logits_processor=logits_processor,
                 **inference_kwargs,
             )
             result = completion["choices"][0]["text"]
         elif isinstance(prompt, list):  # pragma: no cover
             completion = self.model.create_chat_completion(
                 prompt,
-                logits_processor=self.type_adapter.format_output_type(output_type),
+                logits_processor=logits_processor,
                 **inference_kwargs,
             )
             result = completion["choices"][0]["message"]["content"]
 
         self.model.reset()
 
-        return result
+        return Output(content=result)
 
     def generate_batch(
         self,
         model_input,
-        output_type = None,
+        output_type,
         **inference_kwargs,
     ):
-        raise NotImplementedError("LlamaCpp does not support batch generation.")
+        raise NotImplementedError(
+            "LlamaCpp does not support batch generation."
+        )
 
     def generate_stream(
         self,
         model_input: Union[Chat, str],
-        output_type: Optional[OutlinesLogitsProcessor] = None,
+        output_type: Optional[OutlinesLogitsProcessor],
+        tools: Optional[List[ToolDef]],
         **inference_kwargs: Any,
-    ) -> Iterator[str]:
+    ) -> Iterator[StreamingOutput]:
         """Stream text using `llama-cpp-python`.
 
         Parameters
@@ -306,6 +323,8 @@ def generate_stream(
        output_type
             The logits processor the model will use to constrain the format of
             the generated text.
+        tools
+            The tools to use for the generation.
         **inference_kwargs
             Additional keyword arguments to pass to the `Llama.__call__`
             method of the `llama-cpp-python` library.
@@ -316,27 +335,33 @@ def generate_stream(
             An iterator that yields the text generated by the model.
 
         """
+        self.type_adapter.format_tools(tools)
         prompt = self.type_adapter.format_input(model_input)
+        logits_processor = self.type_adapter.format_output_type(output_type)
 
         if isinstance(prompt, str):
             generator = self.model(
                 prompt,
-                logits_processor=self.type_adapter.format_output_type(output_type),
+                logits_processor=logits_processor,
                 stream=True,
                 **inference_kwargs,
             )
             for chunk in generator:
-                yield chunk["choices"][0]["text"]
+                yield StreamingOutput(
+                    content=chunk["choices"][0]["text"]
+                )
 
         elif isinstance(prompt, list):  # pragma: no cover
             generator = self.model.create_chat_completion(
                 prompt,
-                logits_processor=self.type_adapter.format_output_type(output_type),
+                logits_processor=logits_processor,
                 stream=True,
                 **inference_kwargs,
            )
             for chunk in generator:
-                yield chunk["choices"][0]["delta"].get("content", "")
+                yield StreamingOutput(
+                    content=chunk["choices"][0]["delta"].get("content", "")
+                )
 
 
 def from_llamacpp(model: "Llama"):
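
For orientation, here is a minimal sketch of the call pattern this commit introduces, mirroring the tests below. It assumes llama-cpp-python is installed; the model path is a placeholder, and `from_llamacpp` is imported from the `outlines.models.llamacpp` module shown above.

# A minimal sketch, not documented API: "./model.gguf" is a placeholder
# path to a local GGUF model.
from llama_cpp import Llama

from outlines.models.llamacpp import from_llamacpp
from outlines.outputs import Output, StreamingOutput

model = from_llamacpp(Llama("./model.gguf"))

# generate() now returns an Output object; the generated text is in .content.
result = model("Respond with one word. Not more.", None)
assert isinstance(result, Output)
print(result.content)

# generate_stream() now yields StreamingOutput chunks instead of raw strings.
for chunk in model.stream("Respond with one word. Not more.", None):
    assert isinstance(chunk, StreamingOutput)
    print(chunk.content, end="")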

tests/models/test_llamacpp.py

Lines changed: 31 additions & 22 deletions
@@ -12,6 +12,7 @@
     LlamaCppTypeAdapter,
     from_llamacpp
 )
+from outlines.outputs import Output, StreamingOutput
 from outlines.types.dsl import Regex, CFG
 
 
@@ -71,12 +72,12 @@ def ebnf_grammar():
 
 
 def test_llamacpp_simple(model):
-    result = model.generate("Respond with one word. Not more.", None)
-    assert isinstance(result, str)
+    result = model("Respond with one word. Not more.", None)
+    assert isinstance(result, Output)
 
 
 def test_llamacpp_chat(model):
-    result = model.generate(
+    result = model(
         Chat(
             messages=[
                 {"role": "system", "content": "You are a helpful assistant."},
@@ -85,23 +86,23 @@ def test_llamacpp_chat(model):
         ),
         max_tokens=10
     )
-    assert isinstance(result, str)
+    assert isinstance(result, Output)
 
 
 def test_llamacpp_regex(model):
     result = model("Respond with one word. Not more.", Regex(r"[0-9]"))
-    assert isinstance(result, str)
-    assert int(result)
-    assert len(result) == 1
+    assert isinstance(result, Output)
+    assert int(result.content)
+    assert len(result.content) == 1
 
 
 def test_llamacpp_json(model):
     class Foo(BaseModel):
         bar: str
 
     result = model("foo? Respond with one word.", Foo, max_tokens=100)
-    assert isinstance(result, str)
-    assert "bar" in json.loads(result)
+    assert isinstance(result, Output)
+    assert "bar" in json.loads(result.content)
 
 
 def test_llamacpp_choice(model):
@@ -110,12 +111,14 @@ class Foo(Enum):
         foor = "Foo"
 
     result = model("foo?", Foo)
-    assert result == "Foo" or result == "Bar"
+    assert isinstance(result, Output)
+    assert result.content == "Foo" or result.content == "Bar"
 
 
 def test_llamacpp_cfg(model, ebnf_grammar):
     response = model("Respond with one word. Not more.", CFG(ebnf_grammar))
-    assert response in ["yes", "no"]
+    assert isinstance(response, Output)
+    assert response.content in ["yes", "no"]
 
 
 def test_llamacpp_cfg_outlines_core(model, lark_grammar):
@@ -131,15 +134,16 @@ def test_llamacpp_cfg_outlines_core(model, lark_grammar):
 
 
 def test_llamacpp_text_stop(model):
-    result = model.generate("Write the letter a.", None, stop="a", max_tokens=100)
-    assert "a" not in result
+    result = model("Write the letter a.", None, stop="a", max_tokens=100)
+    assert isinstance(result, Output)
+    assert "a" not in result.content
 
 
 def test_llamacpp_stream_simple(model):
     generator = model.stream("Respond with one word. Not more.", None)
 
     for x in generator:
-        assert isinstance(x, str)
+        assert isinstance(x, StreamingOutput)
 
 
 def test_llamacpp_stream_chat(model):
@@ -153,14 +157,16 @@ def test_llamacpp_stream_chat(model):
         max_tokens=10
     )
     for x in generator:
-        assert isinstance(x, str)
+        assert isinstance(x, StreamingOutput)
 
 
 def test_llamacpp_stream_regex(model):
     generator = model.stream("Respond with one word. Not more.", Regex(r"[0-9]"))
 
     x = next(generator)
-    assert isinstance(x, str)
+    assert isinstance(x, StreamingOutput)
+    assert int(x.content)
+    assert len(x.content) == 1
 
 
 def test_llamacpp_stream_json(model):
@@ -170,15 +176,17 @@ class Foo(BaseModel):
     generator = model.stream("foo?", Foo)
 
     x = next(generator)
-    assert x == "{"
+    assert isinstance(x, StreamingOutput)
+    assert "{" in x.content
 
 
 def test_llamacpp_stream_cfg(model, ebnf_grammar):
     response = ""
     for chunk in model.stream(
         "Respond with one word. Not more.", CFG(ebnf_grammar)
     ):
-        response += chunk
+        assert isinstance(chunk, StreamingOutput)
+        response += chunk.content
     assert response in ["yes", "no"]
 
 
@@ -187,7 +195,7 @@ def test_llamacpp_stream_cfg_outlines_core(model, lark_grammar):
         NotImplementedError,
         match="Outlines Core does not support context-free grammar."
     ):
-        for chunk in model.stream(
+        for _ in model.stream(
             "Respond with one word. Not more.",
             CFG(lark_grammar),
             backend="outlines_core"
@@ -203,15 +211,16 @@ class Foo(Enum):
     generator = model.stream("foo?", Foo)
 
     x = next(generator)
-    assert x[0] in ("B", "F")
+    assert isinstance(x, StreamingOutput)
+    assert x.content[0] in ("B", "F")
 
 
 def test_llamacpp_stream_text_stop(model):
     generator = model.stream("Write the letter a.", None, stop="a", max_tokens=100)
 
     result = next(generator)
-    assert isinstance(result, str)
-    assert result != "a"
+    assert isinstance(result, StreamingOutput)
+    assert result.content != "a"
 
 
 def test_llamacpp_batch(model):

tests/models/test_llamacpp_type_adapter.py

Lines changed: 13 additions & 0 deletions
@@ -8,6 +8,7 @@
 from outlines.backends.outlines_core import OutlinesCoreLogitsProcessor
 from outlines.inputs import Chat, Image
 from outlines.models.llamacpp import LlamaCppTypeAdapter
+from outlines.tools import ToolDef
 
 
 @pytest.fixture
@@ -67,3 +68,15 @@ def test_llamacpp_type_adapter_format_output_type(adapter, logits_processor):
     assert isinstance(formatted, LogitsProcessorList)
     assert formatted[0].index == logits_processor.index
     assert formatted[0].tensor_library_name == logits_processor.tensor_library_name
+
+
+def test_llamacpp_type_adapter_tools(adapter):
+    with pytest.raises(
+        NotImplementedError,
+        match="LlamaCpp does not support tools."
+    ):
+        adapter.format_tools(
+            [ToolDef(name="test", description="test", parameters={})]
+        )
+
+    adapter.format_tools(None)
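
And the same guard seen from the model side, as a hedged sketch rather than a test from this commit: `generate` calls `self.type_adapter.format_tools(tools)` before formatting the prompt, so a non-empty tools list fails fast. Here `model` is a `LlamaCpp` instance as in the earlier sketch, and the positional `tools` argument follows the new `generate` signature.

# Hedged sketch: `model` is the LlamaCpp instance from the earlier example.
import pytest

from outlines.tools import ToolDef

tool = ToolDef(name="test", description="test", parameters={})

# generate() runs format_tools() before building the prompt, so this raises
# without ever touching the underlying Llama model.
with pytest.raises(NotImplementedError, match="LlamaCpp does not support tools."):
    model.generate("Hello", None, [tool])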
