-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Expand file tree
/
Copy pathollama_chat_multimodal.py
More file actions
57 lines (38 loc) · 1.53 KB
/
ollama_chat_multimodal.py
File metadata and controls
57 lines (38 loc) · 1.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Copyright (c) Microsoft. All rights reserved.
import asyncio
from agent_framework import Content, Message
from agent_framework.ollama import OllamaChatClient
from dotenv import load_dotenv
# Load environment variables from .env file
# (e.g. OLLAMA_MODEL, picked up by OllamaChatClient when it is constructed).
load_dotenv()
# NOTE(review): this bare string is NOT the module docstring — it appears after
# the imports and the load_dotenv() call, so it is a no-op expression at runtime.
# Consider moving it to the very top of the file if __doc__ matters.
"""
Ollama Agent Multimodal Example
This sample demonstrates implementing a Ollama agent with multimodal input capabilities.
Ensure to install Ollama and have a model running locally before running the sample
Not all Models support multimodal input, to test multimodal input try gemma3:4b
Set the model to use via the OLLAMA_MODEL environment variable or modify the code below.
https://ollama.com/
"""
def create_sample_image() -> str:
    """Return a ``data:`` URI containing a minimal 1x1 red-pixel PNG.

    Used as a tiny, dependency-free image payload for exercising the
    model's multimodal input path.
    """
    # Base64 payload of a single red pixel encoded as PNG.
    pixel_b64 = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8"
        "/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
    )
    return "data:image/png;base64," + pixel_b64
async def test_image() -> None:
    """Send a text prompt plus an inline PNG to a local Ollama model and print the reply.

    Requires a running Ollama server with a multimodal-capable model
    (configured via the OLLAMA_MODEL environment variable).
    """
    chat_client = OllamaChatClient()

    # One user message carrying both a text part and an image part.
    request = Message(
        role="user",
        contents=[
            Content.from_text(text="What's in this image?"),
            Content.from_uri(uri=create_sample_image(), media_type="image/png"),
        ],
    )

    reply = await chat_client.get_response([request])
    print(f"Image Response: {reply}")
async def main() -> None:
    """Entry point: announce and run the multimodal image sample."""
    print("=== Testing Ollama Multimodal ===")
    await test_image()


if __name__ == "__main__":
    # Drive the async entry point with a fresh event loop.
    asyncio.run(main())