-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Expand file tree
/
Copy pathmultimodal_utils.py
More file actions
113 lines (89 loc) · 3.3 KB
/
multimodal_utils.py
File metadata and controls
113 lines (89 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Copyright (c) Microsoft. All rights reserved.
"""
Multimodal support utilities for Agent Lightning.
This module provides helper functions for working with multimodal agents,
particularly for vision-language tasks.
"""
from __future__ import annotations
import base64
from io import BytesIO
from pathlib import Path
from typing import Any, Union
import requests
from PIL import Image
from PIL.Image import Image as PILImage
__all__ = [
"encode_image_to_base64",
"create_image_message",
]
def encode_image_to_base64(image: Union[str, Path, PILImage], max_size: int = 2048) -> str:
    """
    Encode an image to a base64 data URI for multimodal LLM APIs.

    The image is converted to RGB (flattening an RGBA alpha channel onto a
    white background), downscaled so its largest dimension does not exceed
    ``max_size``, and serialized as JPEG at quality 85.

    Args:
        image: Image source (file path, URL string, or PIL Image object).
            A passed-in PIL Image is never modified.
        max_size: Maximum dimension (width or height) after resizing

    Returns:
        Base64 encoded image string with a ``data:image/jpeg;base64,`` prefix

    Raises:
        TypeError: If image type is not supported
        requests.HTTPError: If downloading a URL source fails

    Examples:
        >>> encoded = encode_image_to_base64("photo.jpg")
        >>> encoded[:30]
        'data:image/jpeg;base64,/9j/4A...'
        >>> from PIL import Image
        >>> img = Image.open("photo.jpg")
        >>> encoded = encode_image_to_base64(img)
    """
    # Load image from a URL, a local file path, or an in-memory PIL Image.
    if isinstance(image, (str, Path)):
        image_str = str(image)
        if image_str.startswith(("http://", "https://")):
            response = requests.get(image_str, timeout=30)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
        else:
            img = Image.open(image_str)
    elif isinstance(image, PILImage):
        # Work on a copy: thumbnail() below resizes in place, and we must
        # not mutate the caller's image object.
        img = image.copy()
    else:
        raise TypeError(f"Unsupported image type: {type(image)}")
    # Flatten transparency onto white — JPEG has no alpha channel.
    if img.mode == "RGBA":
        background = Image.new("RGB", img.size, (255, 255, 255))
        background.paste(img, mask=img.split()[3])  # alpha band as paste mask
        img = background
    elif img.mode != "RGB":
        img = img.convert("RGB")
    # Downscale (aspect ratio preserved) only when the image is too large.
    if max(img.size) > max_size:
        img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
    # Encode as JPEG and wrap in a data URI.
    buffered = BytesIO()
    img.save(buffered, format="JPEG", quality=85)
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/jpeg;base64,{img_str}"
def create_image_message(text: str, image: Union[str, Path, PILImage], use_base64: bool = True) -> dict[str, Any]:
    """
    Build an OpenAI-compatible multimodal user message.

    Args:
        text: The text prompt/question
        image: Image source (path, URL, or PIL Image)
        use_base64: If True, encode as base64; if False, use URL directly

    Returns:
        Message dict with role="user" and a two-part (text + image) content list

    Examples:
        >>> msg = create_image_message("What's in the image?", "photo.jpg")
        >>> msg["role"]
        'user'
        >>> len(msg["content"])
        2
    """
    # Remote URLs are passed through untouched only when the caller opted
    # out of base64; every other source is inlined as a data URI.
    is_remote_url = isinstance(image, str) and image.startswith(("http://", "https://"))
    if is_remote_url and not use_base64:
        image_url = image
    else:
        image_url = encode_image_to_base64(image)
    return {
        "role": "user",
        "content": [
            {"type": "text", "text": text},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }