-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Expand file tree
/
Copy pathmultimodal_utils.py
More file actions
113 lines (89 loc) · 3.3 KB
/
multimodal_utils.py
File metadata and controls
113 lines (89 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Copyright (c) Microsoft. All rights reserved.
"""
Multimodal support utilities for Agent Lightning.
This module provides helper functions for working with multimodal agents,
particularly for vision-language tasks.
"""
from __future__ import annotations
import base64
from io import BytesIO
from pathlib import Path
from typing import Any, Union
import requests
from PIL import Image
from PIL.Image import Image as PILImage
__all__ = [
"encode_image_to_base64",
"create_image_message",
]
def encode_image_to_base64(image: Union[str, Path, PILImage], max_size: int = 2048) -> str:
    """
    Encode an image to a base64 data URI for multimodal LLM APIs.

    The image is converted to RGB (flattening an RGBA alpha channel onto a
    white background), downscaled so its largest dimension does not exceed
    ``max_size``, and serialized as JPEG at quality 85.

    Args:
        image: Image source (file path, URL string, or PIL Image object).
            A passed-in PIL Image is never modified.
        max_size: Maximum dimension (width or height) after resizing

    Returns:
        Base64 encoded image string with a ``data:image/jpeg;base64,`` prefix

    Raises:
        TypeError: If image type is not supported
        requests.HTTPError: If downloading a URL source fails

    Examples:
        >>> encoded = encode_image_to_base64("photo.jpg")
        >>> encoded[:30]
        'data:image/jpeg;base64,/9j/4A...'
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> encoded = encode_image_to_base64(img)
    """
    # Load image from a URL, a local file path, or an in-memory PIL Image.
    if isinstance(image, (str, Path)):
        image_str = str(image)
        if image_str.startswith(("http://", "https://")):
            response = requests.get(image_str, timeout=30)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
        else:
            img = Image.open(image_str)
    elif isinstance(image, PILImage):
        # Work on a copy: thumbnail() below resizes in place, and we must
        # not mutate the caller's image object.
        img = image.copy()
    else:
        raise TypeError(f"Unsupported image type: {type(image)}")
    # Flatten transparency onto white — JPEG has no alpha channel.
    if img.mode == "RGBA":
        background = Image.new("RGB", img.size, (255, 255, 255))
        background.paste(img, mask=img.split()[3])  # alpha band as paste mask
        img = background
    elif img.mode != "RGB":
        img = img.convert("RGB")
    # Downscale (aspect ratio preserved) only when the image is too large.
    if max(img.size) > max_size:
        img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
    # Encode as JPEG and wrap in a data URI.
    buffered = BytesIO()
    img.save(buffered, format="JPEG", quality=85)
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/jpeg;base64,{img_str}"
def create_image_message(text: str, image: Union[str, Path, PILImage], use_base64: bool = True) -> dict[str, Any]:
    """
    Build an OpenAI-compatible multimodal user message.

    Args:
        text: The text prompt/question
        image: Image source (path, URL, or PIL Image)
        use_base64: If True, encode as base64; if False, use URL directly

    Returns:
        Message dict with role="user" and a two-part (text + image) content list

    Examples:
        >>> msg = create_image_message("What's in the image?", "photo.jpg")
        >>> msg["role"]
        'user'
        >>> len(msg["content"])
        2
    """
    # Remote URLs are passed through untouched only when the caller opted
    # out of base64; every other source is inlined as a data URI.
    is_remote_url = isinstance(image, str) and image.startswith(("http://", "https://"))
    if is_remote_url and not use_base64:
        image_url = image
    else:
        image_url = encode_image_to_base64(image)
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": text},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }