Commit 42e4505 — "Add multimodal support" (parent commit: 31e4bb9)

File tree: 3 files changed, +172 additions, −18 deletions

outlines/models/lmstudio.py

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
cast,
1212
)
1313

14-
from outlines.inputs import Chat
14+
from outlines.inputs import Chat, Image
1515
from outlines.models.base import AsyncModel, Model, ModelTypeAdapter
1616
from outlines.types import CFG, JsonSchema, Regex
1717

@@ -22,10 +22,20 @@
2222

2323

2424
class LMStudioTypeAdapter(ModelTypeAdapter):
25-
"""Type adapter for the `LMStudio` model.
25+
"""Type adapter for the `LMStudio` model."""
2626

27-
TODO: Add multimodal (Image) support.
28-
"""
27+
def _prepare_lmstudio_image(self, image: Image):
28+
"""Convert Outlines Image to LMStudio image handle.
29+
30+
LMStudio's SDK only accepts file paths, raw bytes, or binary IO objects.
31+
Unlike Ollama which accepts base64 directly, we must decode from base64.
32+
"""
33+
import base64
34+
35+
import lmstudio as lms
36+
37+
image_bytes = base64.b64decode(image.image_str)
38+
return lms.prepare_image(image_bytes)
2939

3040
@singledispatchmethod
3141
def format_input(self, model_input):
@@ -44,17 +54,33 @@ def format_input(self, model_input):
4454
"""
4555
raise TypeError(
4656
f"The input type {type(model_input)} is not available with "
47-
"LMStudio. The only available types are `str` and `Chat`."
57+
"LMStudio. The only available types are `str`, `list` and `Chat`."
4858
)
4959

5060
@format_input.register(str)
5161
def format_str_model_input(self, model_input: str) -> str:
5262
"""Pass through string input directly to LMStudio."""
5363
return model_input
5464

65+
@format_input.register(list)
66+
def format_list_model_input(self, model_input: list) -> "LMStudioChat":
67+
"""Handle list input containing prompt and images."""
68+
from lmstudio import Chat as LMSChat
69+
70+
prompt = model_input[0]
71+
images = model_input[1:]
72+
73+
if not all(isinstance(img, Image) for img in images):
74+
raise ValueError("All assets provided must be of type Image")
75+
76+
chat = LMSChat()
77+
image_handles = [self._prepare_lmstudio_image(img) for img in images]
78+
chat.add_user_message(prompt, images=image_handles)
79+
return chat
80+
5581
@format_input.register(Chat)
5682
def format_chat_model_input(self, model_input: Chat) -> "LMStudioChat":
57-
"""Convert Outlines Chat to LMStudio Chat."""
83+
"""Convert Outlines Chat to LMStudio Chat with image support."""
5884
from lmstudio import Chat as LMSChat
5985

6086
system_prompt = None
@@ -71,7 +97,15 @@ def format_chat_model_input(self, model_input: Chat) -> "LMStudioChat":
7197
content = message["content"]
7298

7399
if role == "user":
74-
chat.add_user_message(content)
100+
if isinstance(content, str):
101+
chat.add_user_message(content)
102+
elif isinstance(content, list):
103+
prompt = content[0]
104+
images = content[1:]
105+
if not all(isinstance(img, Image) for img in images):
106+
raise ValueError("All assets provided must be of type Image")
107+
image_handles = [self._prepare_lmstudio_image(img) for img in images]
108+
chat.add_user_message(prompt, images=image_handles)
75109
elif role == "assistant":
76110
chat.add_assistant_response(content)
77111

@@ -82,9 +116,6 @@ def format_output_type(
82116
) -> Optional[dict]:
83117
"""Format the output type to pass to the model.
84118
85-
TODO: `int`, `float` and other Python types could be supported via
86-
JSON Schema.
87-
88119
Parameters
89120
----------
90121
output_type
@@ -144,7 +175,7 @@ def __init__(self, client: "Client", model_name: Optional[str] = None):
144175

145176
def generate(
146177
self,
147-
model_input: Chat | str,
178+
model_input: Chat | str | list,
148179
output_type: Optional[Any] = None,
149180
**kwargs: Any,
150181
) -> str:
@@ -194,7 +225,7 @@ def generate_batch(
194225

195226
def generate_stream(
196227
self,
197-
model_input: Chat | str,
228+
model_input: Chat | str | list,
198229
output_type: Optional[Any] = None,
199230
**kwargs: Any,
200231
) -> Iterator[str]:
@@ -262,7 +293,7 @@ def __init__(
262293

263294
async def generate(
264295
self,
265-
model_input: Chat | str,
296+
model_input: Chat | str | list,
266297
output_type: Optional[Any] = None,
267298
**kwargs: Any,
268299
) -> str:
@@ -316,7 +347,7 @@ async def generate_batch(
316347

317348
async def generate_stream( # type: ignore
318349
self,
319-
model_input: Chat | str,
350+
model_input: Chat | str | list,
320351
output_type: Optional[Any] = None,
321352
**kwargs: Any,
322353
) -> AsyncIterator[str]:

tests/models/test_lmstudio.py

Lines changed: 84 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,15 @@
1+
import io
12
import json
23
from enum import Enum
34
from typing import Annotated
45

56
import lmstudio as lms
67
import pytest
8+
from PIL import Image as PILImage
79
from pydantic import BaseModel, Field
810

911
import outlines
10-
from outlines.inputs import Chat
12+
from outlines.inputs import Chat, Image, Video
1113
from outlines.models import AsyncLMStudio, LMStudio
1214

1315
MODEL_NAME = "qwen2.5-coder-1.5b-instruct-mlx"
@@ -37,6 +39,21 @@ def async_model_no_model_name():
3739
return AsyncLMStudio(client)
3840

3941

42+
@pytest.fixture(scope="session")
43+
def image():
44+
width, height = 1, 1
45+
white_background = (255, 255, 255)
46+
image = PILImage.new("RGB", (width, height), white_background)
47+
48+
# Save to an in-memory bytes buffer and read as png
49+
buffer = io.BytesIO()
50+
image.save(buffer, format="PNG")
51+
buffer.seek(0)
52+
image = PILImage.open(buffer)
53+
54+
return image
55+
56+
4057
def test_lmstudio_init_from_client():
4158
client = lms.get_default_client()
4259

@@ -87,6 +104,34 @@ def test_lmstudio_call(model):
87104
assert isinstance(result, str)
88105

89106

107+
@pytest.mark.api_call
108+
def test_lmstudio_simple_vision(image, model):
109+
# This is not using a vision model, so it's not able to describe
110+
# the image, but we're still checking the model input syntax
111+
result = model.generate(
112+
["What does this logo represent?", Image(image)],
113+
model=MODEL_NAME,
114+
)
115+
assert isinstance(result, str)
116+
117+
118+
@pytest.mark.api_call
119+
def test_lmstudio_chat_with_image(image, model):
120+
result = model.generate(
121+
Chat(
122+
[
123+
{"role": "system", "content": "You are a helpful assistant."},
124+
{"role": "user", "content": [
125+
"What does this logo represent?",
126+
Image(image)
127+
]},
128+
]
129+
),
130+
model=MODEL_NAME,
131+
)
132+
assert isinstance(result, str)
133+
134+
90135
@pytest.mark.api_call
91136
def test_lmstudio_chat(model):
92137
chat = Chat(messages=[
@@ -118,10 +163,13 @@ class Foo(Enum):
118163

119164

120165
@pytest.mark.api_call
121-
def test_lmstudio_wrong_input_type(model):
166+
def test_lmstudio_wrong_input_type(model, image):
122167
with pytest.raises(TypeError, match="is not available"):
123168
model.generate({"foo?": "bar?"}, None)
124169

170+
with pytest.raises(ValueError, match="All assets provided must be of type Image"):
171+
model.generate(["foo?", Image(image), Video("")], None)
172+
125173

126174
@pytest.mark.api_call
127175
def test_lmstudio_stream(model):
@@ -198,6 +246,36 @@ async def test_lmstudio_async_call(async_model):
198246
assert isinstance(result, str)
199247

200248

249+
@pytest.mark.api_call
250+
@pytest.mark.asyncio
251+
async def test_lmstudio_async_simple_vision(image, async_model):
252+
# This is not using a vision model, so it's not able to describe
253+
# the image, but we're still checking the model input syntax
254+
result = await async_model.generate(
255+
["What does this logo represent?", Image(image)],
256+
model=MODEL_NAME,
257+
)
258+
assert isinstance(result, str)
259+
260+
261+
@pytest.mark.api_call
262+
@pytest.mark.asyncio
263+
async def test_lmstudio_async_chat_with_image(image, async_model):
264+
result = await async_model.generate(
265+
Chat(
266+
[
267+
{"role": "system", "content": "You are a helpful assistant."},
268+
{"role": "user", "content": [
269+
"What does this logo represent?",
270+
Image(image)
271+
]},
272+
]
273+
),
274+
model=MODEL_NAME,
275+
)
276+
assert isinstance(result, str)
277+
278+
201279
@pytest.mark.api_call
202280
@pytest.mark.asyncio
203281
async def test_lmstudio_async_chat(async_model):
@@ -233,10 +311,13 @@ class Foo(Enum):
233311

234312
@pytest.mark.api_call
235313
@pytest.mark.asyncio
236-
async def test_lmstudio_async_wrong_input_type(async_model):
314+
async def test_lmstudio_async_wrong_input_type(async_model, image):
237315
with pytest.raises(TypeError, match="is not available"):
238316
await async_model.generate({"foo?": "bar?"}, None)
239317

318+
with pytest.raises(ValueError, match="All assets provided must be of type Image"):
319+
await async_model.generate(["foo?", Image(image), Video("")], None)
320+
240321

241322
@pytest.mark.api_call
242323
@pytest.mark.asyncio

tests/models/test_lmstudio_type_adapter.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
1+
import io
12
import json
23
import sys
34
from dataclasses import dataclass
45

56
import pytest
67
from genson import SchemaBuilder
8+
from PIL import Image as PILImage
79
from pydantic import BaseModel
810

9-
from outlines.inputs import Chat
11+
from outlines.inputs import Chat, Image
1012
from outlines.models.lmstudio import LMStudioTypeAdapter
1113
from outlines.types import cfg, json_schema, regex
1214

@@ -34,13 +36,37 @@ def adapter():
3436
return LMStudioTypeAdapter()
3537

3638

39+
@pytest.fixture
40+
def image():
41+
width, height = 1, 1
42+
white_background = (255, 255, 255)
43+
image = PILImage.new("RGB", (width, height), white_background)
44+
45+
# Save to an in-memory bytes buffer and read as png
46+
buffer = io.BytesIO()
47+
image.save(buffer, format="PNG")
48+
buffer.seek(0)
49+
image = PILImage.open(buffer)
50+
51+
return image
52+
53+
3754
def test_lmstudio_type_adapter_input_text(adapter):
3855
text_input = "prompt"
3956
result = adapter.format_input(text_input)
4057
assert isinstance(result, str)
4158
assert result == text_input
4259

4360

61+
def test_lmstudio_type_adapter_input_vision(adapter, image):
62+
import lmstudio as lms
63+
64+
image_input = Image(image)
65+
text_input = "prompt"
66+
result = adapter.format_input([text_input, image_input])
67+
assert isinstance(result, lms.Chat)
68+
69+
4470
def test_lmstudio_type_adapter_input_chat(adapter):
4571
chat_input = Chat(messages=[
4672
{"role": "system", "content": "You are a helpful assistant."},
@@ -66,6 +92,22 @@ def test_lmstudio_type_adapter_input_chat_no_system(adapter):
6692
assert isinstance(result, lms.Chat)
6793

6894

95+
def test_lmstudio_type_adapter_input_chat_with_image(adapter, image):
96+
import lmstudio as lms
97+
98+
image_input = Image(image)
99+
chat_input = Chat(messages=[
100+
{"role": "system", "content": "You are a helpful assistant."},
101+
{"role": "user", "content": [
102+
"What is in this image?",
103+
image_input,
104+
]},
105+
{"role": "assistant", "content": "response"},
106+
])
107+
result = adapter.format_input(chat_input)
108+
assert isinstance(result, lms.Chat)
109+
110+
69111
def test_lmstudio_type_adapter_input_invalid(adapter):
70112
prompt = {"foo": "bar"}
71113
with pytest.raises(TypeError, match="The input type"):

Commit comments: 0