
Commit 629cd9b

feat: enable VLMs (#126)
* Started VLM support for Ollama:
  - new ImageCBlock
  - `images` argument for `m.instruct` and `Instruction`
  - `get_images_from_component(c)` method to extract images from components
  - formatter handles images now
* VLM support for the OpenAI backend:
  - new formatting from Mellea Message to OpenAI message
  - [patch] tool-calling patch to work with multiple OpenAI-compatible inference engines
* ImageCBlock --> ImageBlock; valid PNG base64 testing; better `get_images_from_component`; images added to TemplateRepresentation and used there for Message construction
* `m.instruct` now also takes a list of PIL images
* `m.chat` takes images now
* Fixed OpenAI tool args
* LiteLLM uses OpenAI formatting for VLMs
* Better pretty-printing for Message images
* Examples for using vision models with different backends
* Changed formatter cases; fixed a test failure
1 parent 61d7f0e commit 629cd9b
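
At a glance, the change lets a session take images directly on `instruct` and `chat`. A minimal sketch, condensed from the example files added in this commit (the model id and image path are just the placeholders those examples use, and it assumes both PIL images and `ImageBlock`s are accepted by either call):

from PIL import Image

from mellea import LinearContext, start_session
from mellea.stdlib.base import ImageBlock

# Start an Ollama-backed session with a vision-capable model (placeholder model id).
m = start_session(model_id="granite3.2-vision", ctx=LinearContext())

# PIL images can be passed directly ...
img = Image.open("pointing_up.jpg")
res = m.instruct("Is the subject in the image smiling?", images=[img])
print(str(res))

# ... or converted to the new ImageBlock (base64 PNG) first.
block = ImageBlock.from_pil_image(img)
res = m.chat("How many eyes can you identify in the image?", images=[block])
print(str(res.content))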

File tree

13 files changed: 326 additions, 14 deletions

Binary file (36.2 KB); preview not rendered.
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
"""Examples of using vision models with LiteLLM backend."""

import os

import litellm
from PIL import Image

from mellea import MelleaSession, start_session
from mellea.backends.litellm import LiteLLMBackend
from mellea.backends.openai import OpenAIBackend
from mellea.stdlib.base import ImageBlock

# use LiteLLM to talk to Ollama or anthropic or.....
m = MelleaSession(LiteLLMBackend("ollama/granite3.2-vision"))
# m = MelleaSession(LiteLLMBackend("ollama/llava"))
# m = MelleaSession(LiteLLMBackend("anthropic/claude-3-haiku-20240307"))

test_pil = Image.open("pointing_up.jpg")

# check if model is able to do text chat
ch = m.chat("What's 1+1?")
print(str(ch.content))

# test with PIL image
res = m.instruct(
    "Is there a person on the image? Is the subject in the image smiling?",
    images=[test_pil],
)
print(str(res))
# print(m.last_prompt())

# with PIL image and using m.chat
res = m.chat("How many eyes can you identify in the image? Explain.", images=[test_pil])
print(str(res.content))

# and now without images again...
res = m.instruct("How many eyes can you identify in the image?", images=[])
print(str(res))
Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
"""Example of using Ollama with vision models with linear context."""

from PIL import Image

from mellea import LinearContext, start_session
from mellea.stdlib.base import ImageBlock

m = start_session(model_id="granite3.2-vision", ctx=LinearContext())
# m = start_session(model_id="llava", ctx=LinearContext())

# load image
test_img = Image.open("pointing_up.jpg")

# ask a question about the image
res = m.instruct("Is the subject in the image smiling?", images=[test_img])
print(f"Result:{res!s}")

# This instruction should refer to the first image.
res2 = m.instruct("How many eyes can you identify in the image? Explain.")
print(f"Result:{res2!s}")
Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
"""Examples using vision models with OpenAI backend."""

import os

from PIL import Image

from mellea import MelleaSession
from mellea.backends.openai import OpenAIBackend
from mellea.stdlib.base import ImageBlock

# # using anthropic AI model ...
# anth_key = os.environ.get("ANTHROPIC_API_KEY")
# m = MelleaSession(OpenAIBackend(model_id="claude-3-haiku-20240307",
#                                 api_key=anth_key,  # Your Anthropic API key
#                                 base_url="https://api.anthropic.com/v1/"  # Anthropic's API endpoint
#                                 ))

# using LM Studio model locally
m = MelleaSession(
    OpenAIBackend(model_id="qwen/qwen2.5-vl-7b", base_url="http://127.0.0.1:1234/v1")
)

# load PIL image and convert to mellea ImageBlock
test_pil = Image.open("pointing_up.jpg")
test_img = ImageBlock.from_pil_image(test_pil)

# check if model is able to do text chat
ch = m.chat("What's 1+1?")
print(str(ch.content))

# now test with MELLEA image
res = m.instruct(
    "Is there a person on the image? Is the subject in the image smiling?",
    images=[test_img],
)
print(str(res))
# print(m.last_prompt())

# and now with PIL image and using m.chat
res = m.chat("How many eyes can you identify in the image? Explain.", images=[test_pil])
print(str(res.content))

# and now without images again...
res = m.instruct("How many eyes can you identify in the image?", images=[])
print(str(res))

mellea/backends/formatter.py

Lines changed: 8 additions & 0 deletions
@@ -71,6 +71,14 @@ def _to_msg(c: Component | CBlock) -> Message:
         match c:
             case Message():
                 return c
+            case Component():
+                images = None
+                tr = c.format_for_llm()
+                if isinstance(tr, TemplateRepresentation):
+                    images = tr.images
+
+                # components can have images
+                return Message(role=role, content=self.print(c), images=images)
             case _:
                 return Message(role=role, content=self.print(c))


mellea/backends/litellm.py

Lines changed: 11 additions & 1 deletion
@@ -12,6 +12,7 @@
 import mellea.backends.model_ids as model_ids
 from mellea.backends import BaseModelSubclass
 from mellea.backends.formatter import Formatter, FormatterBackend, TemplateFormatter
+from mellea.backends.openai import OpenAIBackend
 from mellea.backends.tools import (
     add_tools_from_context_actions,
     add_tools_from_model_options,
@@ -213,18 +214,27 @@ def _generate_from_chat_context_standard(
         )
         # Convert our linearized context into a sequence of chat messages. Template formatters have a standard way of doing this.
         messages: list[Message] = self.formatter.to_chat_messages(linearized_context)
+
         # Add the final message.
         match action:
             case ALoraRequirement():
                 raise Exception("The LiteLLM backend does not support activated LoRAs.")
             case _:
                 messages.extend(self.formatter.to_chat_messages([action]))

+        # TODO: the supports_vision function is not reliably predicting if models support vision. E.g., ollama/llava is not a vision model?
+        # if any(m.images is not None for m in messages):
+        #     # check if model can handle images
+        #     assert litellm.supports_vision(
+        #         model=self.model_id), f"Model {self.model_id} does not support vision. Please use a different model."
+
         conversation: list[dict] = []
         system_prompt = model_opts.get(ModelOption.SYSTEM_PROMPT, "")
         if system_prompt != "":
             conversation.append({"role": "system", "content": system_prompt})
-        conversation.extend([{"role": m.role, "content": m.content} for m in messages])
+        conversation.extend(
+            [OpenAIBackend.message_to_openai_message(m) for m in messages]
+        )

         if format is not None:
             response_format = {

mellea/backends/ollama.py

Lines changed: 6 additions & 1 deletion
@@ -287,7 +287,12 @@ def generate_from_chat_context(
         if system_prompt != "":
             conversation.append({"role": "system", "content": system_prompt})

-        conversation.extend([{"role": m.role, "content": m.content} for m in messages])
+        conversation.extend(
+            [
+                {"role": m.role, "content": m.content, "images": m.images}
+                for m in messages
+            ]
+        )

         # Append tool call information if applicable.
         tools: dict[str, Callable] = dict()

mellea/backends/openai.py

Lines changed: 38 additions & 5 deletions
@@ -350,6 +350,40 @@ def _generate_from_chat_context_alora(
             ),
         )

+    @staticmethod
+    def message_to_openai_message(msg: Message):
+        if msg.images is not None:
+            img_list = [
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{img}"},
+                }
+                for img in msg.images
+            ]
+
+            return {
+                "role": msg.role,
+                "content": [{"type": "text", "text": msg.content}, *img_list],
+            }
+        else:
+            return {"role": msg.role, "content": msg.content}
+        # Target format:
+        # {
+        #     "role": "user",
+        #     "content": [
+        #         {
+        #             "type": "text",
+        #             "text": "What's in this picture?"
+        #         },
+        #         {
+        #             "type": "image_url",
+        #             "image_url": {
+        #                 "url": "data:image/jpeg;base64,<base64_string>"
+        #             }
+        #         }
+        #     ]
+        # }
+
     def _generate_from_chat_context_standard(
         self,
         action: Component | CBlock,
@@ -384,7 +418,7 @@ def _generate_from_chat_context_standard(
         system_prompt = model_opts.get(ModelOption.SYSTEM_PROMPT, "")
         if system_prompt != "":
             conversation.append({"role": "system", "content": system_prompt})
-        conversation.extend([{"role": m.role, "content": m.content} for m in messages])
+        conversation.extend([self.message_to_openai_message(m) for m in messages])

         if format is not None:
             response_format = {
@@ -420,15 +454,14 @@ def _generate_from_chat_context_standard(
             thinking = "medium"

         formatted_tools = convert_tools_to_json(tools)
+        use_tools = len(formatted_tools) > 0
+
         chat_response: ChatCompletion = self._client.chat.completions.create(
             model=self._hf_model_id,
             messages=conversation,  # type: ignore
             reasoning_effort=thinking,  # type: ignore
             response_format=response_format,  # type: ignore
-            tool_choice=(
-                "auto" if formatted_tools and len(formatted_tools) > 0 else "none"
-            ),
-            tools=formatted_tools,  # type: ignore
+            tools=formatted_tools if use_tools else None,  # type: ignore
             # parallel_tool_calls=False, # We only support calling one tool per turn. But we do the choosing on our side so we leave this False.
             **self._make_backend_specific_and_remove(
                 model_opts, is_chat_context=ctx.is_chat_context
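
For reference, a small sketch of what `message_to_openai_message` produces; it assumes `Message` can be imported from `mellea.stdlib.chat` (the import path is not shown in this diff) and that `role` is a plain string, and it builds a 1x1 PNG in memory so the `ImageBlock` payload is valid base64:

from PIL import Image as PILImage

from mellea.backends.openai import OpenAIBackend
from mellea.stdlib.base import ImageBlock
from mellea.stdlib.chat import Message  # assumed import path; not shown in this diff

# A tiny in-memory PNG gives a valid base64 payload for the ImageBlock.
img = ImageBlock.from_pil_image(PILImage.new("RGB", (1, 1)))

msg = Message(role="user", content="What's in this picture?", images=[img])
print(OpenAIBackend.message_to_openai_message(msg))
# -> {"role": "user", "content": [{"type": "text", "text": "..."},
#                                 {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}}]}

text_only = Message(role="user", content="What's 1+1?")
print(OpenAIBackend.message_to_openai_message(text_only))
# -> {"role": "user", "content": "What's 1+1?"}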

mellea/stdlib/base.py

Lines changed: 89 additions & 0 deletions
@@ -3,12 +3,17 @@
 from __future__ import annotations

 import abc
+import base64
+import binascii
 import datetime
 from collections.abc import Callable, Iterable, Mapping
 from copy import deepcopy
 from dataclasses import dataclass
+from io import BytesIO
 from typing import Any, Protocol, runtime_checkable

+from PIL import Image as PILImage
+
 from mellea.helpers.fancy_logger import FancyLogger

@@ -43,6 +48,70 @@ def __repr__(self):
         return f"CBlock({self.value}, {self._meta.__repr__()})"


+class ImageBlock:
+    """A `ImageBlock` represents an image (as base64 PNG)."""
+
+    def __init__(self, value: str, meta: dict[str, Any] | None = None):
+        """Initializes the ImageBlock with a base64 PNG string representation and some metadata."""
+        assert self.is_valid_base64_png(value), (
+            "Invalid base64 string representation of image."
+        )
+        self._value = value
+        self._meta = {} if meta is None else meta
+
+    @staticmethod
+    def is_valid_base64_png(s: str) -> bool:
+        """Checks if a string is a valid base64 string [AIA PAI Nc Hin R v1.0]."""
+        try:
+            # Check if the string has a data URI prefix and remove it.
+            if "data:" in s and "base64," in s:
+                s = s.split("base64,")[1]
+
+            # Add padding if necessary
+            s = s.strip()
+            mod4 = len(s) % 4
+            if mod4 > 0:
+                s = s + "=" * (4 - mod4)
+
+            # Attempt to decode the Base64 string
+            decoded_data = base64.b64decode(s, validate=True)
+
+            # The official PNG signature is 8 bytes long.
+            png_signature = b"\x89PNG\r\n\x1a\n"
+
+            if decoded_data.startswith(png_signature):
+                return True
+            else:
+                return False
+
+            return True
+        except (binascii.Error, ValueError):
+            return False
+
+    @staticmethod
+    def pil_to_base64(image: PILImage.Image) -> str:
+        """Converts a PIL image to a base64 string representation."""
+        img_io = BytesIO()
+        image.save(img_io, "PNG")
+        return base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+    @classmethod
+    def from_pil_image(
+        cls, image: PILImage.Image, meta: dict[str, Any] | None = None
+    ) -> ImageBlock:
+        """Converts a PIL image to a base64 string representation."""
+        image_base64 = cls.pil_to_base64(image)
+        return cls(image_base64, meta)
+
+    def __str__(self):
+        """Stringifies the block."""
+        return self._value
+
+    def __repr__(self):
+        """Provides a python-parsable representation of the block (usually)."""
+        return f"ImageBlock({self._value}, {self._meta.__repr__()})"
+
+
 @runtime_checkable
 class Component(Protocol):
     """A `Component` is a composite data structure that is intended to be represented to an LLM."""
@@ -59,6 +128,25 @@ def format_for_llm(self) -> TemplateRepresentation | str:
         raise NotImplementedError("format_for_llm isn't implemented by default")


+def get_images_from_component(c: Component) -> None | list[ImageBlock]:
+    """Gets images from a `Component` if they are present and a non-empty list, otherwise returns None."""
+    if hasattr(c, "images"):
+        imgs = c.images
+        if imgs is not None:
+            assert isinstance(imgs, list), "images field must be a list."
+            assert all(isinstance(im, ImageBlock) for im in imgs), (
+                "all elements of images list must be ImageBlocks."
+            )
+            if len(imgs) == 0:
+                return None
+            else:
+                return imgs
+        else:
+            return None
+    else:
+        return None
+
+
 class ModelOutputThunk(CBlock):
     """A `ModelOutputThunk` is a special type of `CBlock` that we know came from a model's output. It is possible to instantiate one without the output being computed yet."""

@@ -452,6 +540,7 @@ class TemplateRepresentation:
     fields: list[Any] | None = None
     template: str | None = None
     template_order: list[str] | None = None
+    images: list[ImageBlock] | None = None


 @dataclass
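
A minimal sketch of how the new pieces in `base.py` fit together; `CaptionedPhoto` is a hypothetical component defined only for this illustration (any object with a `format_for_llm` method and an `images` attribute holding `ImageBlock`s behaves the same way):

from dataclasses import dataclass

from PIL import Image as PILImage

from mellea.stdlib.base import ImageBlock, get_images_from_component


@dataclass
class CaptionedPhoto:
    """Hypothetical component used only for this sketch."""

    caption: str
    images: list[ImageBlock] | None = None

    def format_for_llm(self) -> str:
        # Returning a plain string is allowed by the Component protocol.
        return self.caption


# Build a valid base64-PNG ImageBlock from an in-memory 1x1 image.
block = ImageBlock.from_pil_image(PILImage.new("RGB", (1, 1)))
assert ImageBlock.is_valid_base64_png(str(block))

photo = CaptionedPhoto(caption="A test pattern.", images=[block])

# Returns the list of ImageBlocks, or None when the list is missing or empty.
print(get_images_from_component(photo))                        # -> [ImageBlock(...)]
print(get_images_from_component(CaptionedPhoto("No image.")))  # -> None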
