Commit 00ea1d5

Single function openaiclient and test
1 parent b0a473b commit 00ea1d5

5 files changed: +198 -256 lines


adalflow/adalflow/components/model_client/openai_client.py

Lines changed: 57 additions & 75 deletions
@@ -52,14 +52,6 @@
 log = logging.getLogger(__name__)
 T = TypeVar("T")
 
-# Models that support multimodal inputs
-MULTIMODAL_MODELS = {
-    "gpt-4o",  # Versatile, high-intelligence flagship model
-    "gpt-4o-mini",  # Fast, affordable small model for focused tasks
-    "o1",  # Reasoning model that excels at complex, multi-step tasks
-    "o1-mini",  # Smaller reasoning model for complex tasks
-}
-
 
 # completion parsing functions and you can combine them into one singple chat completion parser
 def get_first_message_content(completion: ChatCompletion) -> str:
@@ -108,7 +100,7 @@ def get_probabilities(completion: ChatCompletion) -> List[List[TokenLogProb]]:
 class OpenAIClient(ModelClient):
     __doc__ = r"""A component wrapper for the OpenAI API client.
 
-    Support both embedding and chat completion API.
+    Support both embedding and chat completion API, including multimodal capabilities.
 
     Users (1) simplify use ``Embedder`` and ``Generator`` components by passing OpenAIClient() as the model_client.
     (2) can use this as an example to create their own API client or extend this class(copying and modifing the code) in their own project.
@@ -119,6 +111,9 @@ class OpenAIClient(ModelClient):
         Instead
         - use :ref:`OutputParser<components-output_parsers>` for response parsing and formating.
 
+    For multimodal inputs, provide images in model_kwargs["images"] as a path, URL, or list of them.
+    The model must support vision capabilities (e.g., gpt-4o, gpt-4o-mini, o1, o1-mini).
+
     Args:
         api_key (Optional[str], optional): OpenAI API key. Defaults to None.
         chat_completion_parser (Callable[[Completion], Any], optional): A function to parse the chat completion to a str. Defaults to None.
@@ -127,6 +122,7 @@ class OpenAIClient(ModelClient):
     References:
         - Embeddings models: https://platform.openai.com/docs/guides/embeddings
        - Chat models: https://platform.openai.com/docs/guides/text-generation
+        - Vision models: https://platform.openai.com/docs/guides/vision
        - OpenAI docs: https://platform.openai.com/docs/introduction
    """

@@ -209,7 +205,7 @@ def track_completion_usage(
     def parse_embedding_response(
         self, response: CreateEmbeddingResponse
     ) -> EmbedderOutput:
-        r"""Parse the embedding response to a structure LightRAG components can understand.
+        r"""Parse the embedding response to a structure Adalflow components can understand.
 
        Should be called in ``Embedder``.
        """
@@ -227,7 +223,20 @@ def convert_inputs_to_api_kwargs(
     ) -> Dict:
         r"""
         Specify the API input type and output api_kwargs that will be used in _call and _acall methods.
-        Convert the Component's standard input, and system_input(chat model) and model_kwargs into API-specific format
+        Convert the Component's standard input, and system_input(chat model) and model_kwargs into API-specific format.
+        For multimodal inputs, images can be provided in model_kwargs["images"] as a string path, URL, or list of them.
+        The model specified in model_kwargs["model"] must support multimodal capabilities when using images.
+
+        Args:
+            input: The input text or messages to process
+            model_kwargs: Additional parameters including:
+                - images: Optional image source(s) as path, URL, or list of them
+                - detail: Image detail level ('auto', 'low', or 'high'), defaults to 'auto'
+                - model: The model to use (must support multimodal inputs if images are provided)
+            model_type: The type of model (EMBEDDER or LLM)
+
+        Returns:
+            Dict: API-specific kwargs for the model call
        """
 
        final_model_kwargs = model_kwargs.copy()
@@ -241,6 +250,8 @@ def convert_inputs_to_api_kwargs(
         elif model_type == ModelType.LLM:
             # convert input to messages
             messages: List[Dict[str, str]] = []
+            images = final_model_kwargs.pop("images", None)
+            detail = final_model_kwargs.pop("detail", "auto")
 
             if self._input_type == "messages":
                 system_start_tag = "<START_OF_SYSTEM_PROMPT>"
@@ -257,14 +268,29 @@ def convert_inputs_to_api_kwargs(
                 if match:
                     system_prompt = match.group(1)
                     input_str = match.group(2)
-
                 else:
                     print("No match found.")
                 if system_prompt and input_str:
                     messages.append({"role": "system", "content": system_prompt})
-                    messages.append({"role": "user", "content": input_str})
+                    if images:
+                        content = [{"type": "text", "text": input_str}]
+                        if isinstance(images, (str, dict)):
+                            images = [images]
+                        for img in images:
+                            content.append(self._prepare_image_content(img, detail))
+                        messages.append({"role": "user", "content": content})
+                    else:
+                        messages.append({"role": "user", "content": input_str})
             if len(messages) == 0:
-                messages.append({"role": "system", "content": input})
+                if images:
+                    content = [{"type": "text", "text": input}]
+                    if isinstance(images, (str, dict)):
+                        images = [images]
+                    for img in images:
+                        content.append(self._prepare_image_content(img, detail))
+                    messages.append({"role": "user", "content": content})
+                else:
+                    messages.append({"role": "system", "content": input})
             final_model_kwargs["messages"] = messages
         else:
             raise ValueError(f"model_type {model_type} is not supported")
@@ -349,9 +375,19 @@ def _encode_image(self, image_path: str) -> str:
 
         Returns:
             Base64 encoded image string.
+
+        Raises:
+            ValueError: If the file cannot be read or doesn't exist.
        """
-        with open(image_path, "rb") as image_file:
-            return base64.b64encode(image_file.read()).decode("utf-8")
+        try:
+            with open(image_path, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode("utf-8")
+        except FileNotFoundError:
+            raise ValueError(f"Image file not found: {image_path}")
+        except PermissionError:
+            raise ValueError(f"Permission denied when reading image file: {image_path}")
+        except Exception as e:
+            raise ValueError(f"Error encoding image {image_path}: {str(e)}")
 
     def _prepare_image_content(
         self, image_source: Union[str, Dict[str, Any]], detail: str = "auto"
@@ -382,77 +418,23 @@ def _prepare_image_content(
             }
         return image_source
 
-    def generate(
-        self,
-        prompt: str,
-        images: Optional[
-            Union[str, List[str], Dict[str, Any], List[Dict[str, Any]]]
-        ] = None,
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> GeneratorOutput:
-        """Generate text response for given prompt and optionally images.
-
-        Args:
-            prompt: Text prompt.
-            images: Optional image source(s) - can be path(s), URL(s), or formatted dict(s).
-            model_kwargs: Additional model parameters.
-
-        Returns:
-            GeneratorOutput containing the model's response.
-        """
-        model_kwargs = model_kwargs or {}
-        model = model_kwargs.get("model", "gpt-4o-mini")
-        max_tokens = model_kwargs.get("max_tokens", 300)
-        detail = model_kwargs.get("detail", "auto")
-
-        # Check if model supports multimodal inputs when images are provided
-        if images and model not in MULTIMODAL_MODELS:
-            return GeneratorOutput(
-                error=f"Model {model} does not support multimodal inputs. Supported models: {MULTIMODAL_MODELS}"
-            )
-
-        # Prepare message content
-        if images:
-            content = [{"type": "text", "text": prompt}]
-            if not isinstance(images, list):
-                images = [images]
-            for img in images:
-                content.append(self._prepare_image_content(img, detail))
-            messages = [{"role": "user", "content": content}]
-        else:
-            messages = [{"role": "user", "content": prompt}]
-
-        try:
-            response = self.client.chat.completions.create(
-                model=model,
-                messages=messages,
-                max_tokens=max_tokens,
-            )
-            return GeneratorOutput(
-                id=response.id,
-                data=response.choices[0].message.content,
-                usage=response.usage.model_dump() if response.usage else None,
-                raw_response=response.model_dump(),
-            )
-        except Exception as e:
-            return GeneratorOutput(error=str(e))
-
 
+# Example usage:
 # if __name__ == "__main__":
 #     from adalflow.core import Generator
 #     from adalflow.utils import setup_env, get_logger
-
+#
 #     log = get_logger(level="DEBUG")
-
+#
 #     setup_env()
 #     prompt_kwargs = {"input_str": "What is the meaning of life?"}
-
+#
 #     gen = Generator(
 #         model_client=OpenAIClient(),
 #         model_kwargs={"model": "gpt-3.5-turbo", "stream": True},
 #     )
 #     gen_response = gen(prompt_kwargs)
 #     print(f"gen_response: {gen_response}")
-
+#
 #     for genout in gen_response.data:
 #         print(f"genout: {genout}")

adalflow/adalflow/utils/lazy_import.py

Lines changed: 0 additions & 10 deletions
@@ -215,13 +215,3 @@ def safe_import(
         raise ImportError(f"{install_message}")
 
     return return_modules[0] if len(return_modules) == 1 else return_modules
-
-
-OPTIONAL_PACKAGES = {
-    "openai": "openai",  # For OpenAI API clients
-    "transformers": "transformers",  # For local models
-    "torch": "torch",  # For PyTorch models
-    "anthropic": "anthropic",  # For Claude models
-    "groq": "groq",  # For Groq models
-    "cohere": "cohere",  # For Cohere models
-}

adalflow/tests/test_openai_client.py

Lines changed: 101 additions & 0 deletions
@@ -1,5 +1,7 @@
 import unittest
 from unittest.mock import patch, AsyncMock, Mock
+import os
+import base64
 
 from openai.types import CompletionUsage
 from openai.types.chat import ChatCompletion
@@ -42,6 +44,105 @@ def setUp(self):
             "model": "gpt-3.5-turbo",
         }
 
+    def test_encode_image(self):
+        # Create a temporary test image file
+        test_image_path = "test_image.jpg"
+        test_content = b"fake image content"
+        try:
+            with open(test_image_path, "wb") as f:
+                f.write(test_content)
+
+            # Test successful encoding
+            encoded = self.client._encode_image(test_image_path)
+            self.assertEqual(encoded, base64.b64encode(test_content).decode("utf-8"))
+
+            # Test file not found
+            with self.assertRaises(ValueError) as context:
+                self.client._encode_image("nonexistent.jpg")
+            self.assertIn("Image file not found", str(context.exception))
+
+        finally:
+            # Cleanup
+            if os.path.exists(test_image_path):
+                os.remove(test_image_path)
+
+    def test_prepare_image_content(self):
+        # Test URL image
+        url = "https://example.com/image.jpg"
+        result = self.client._prepare_image_content(url)
+        self.assertEqual(
+            result,
+            {"type": "image_url", "image_url": {"url": url, "detail": "auto"}},
+        )
+
+        # Test with custom detail level
+        result = self.client._prepare_image_content(url, detail="high")
+        self.assertEqual(
+            result,
+            {"type": "image_url", "image_url": {"url": url, "detail": "high"}},
+        )
+
+        # Test with pre-formatted content
+        pre_formatted = {
+            "type": "image_url",
+            "image_url": {"url": url, "detail": "low"},
+        }
+        result = self.client._prepare_image_content(pre_formatted)
+        self.assertEqual(result, pre_formatted)
+
+    def test_convert_inputs_to_api_kwargs_with_images(self):
+        # Test with single image URL
+        model_kwargs = {
+            "model": "gpt-4-vision-preview",
+            "images": "https://example.com/image.jpg",
+        }
+        result = self.client.convert_inputs_to_api_kwargs(
+            input="Describe this image",
+            model_kwargs=model_kwargs,
+            model_type=ModelType.LLM,
+        )
+        expected_content = [
+            {"type": "text", "text": "Describe this image"},
+            {
+                "type": "image_url",
+                "image_url": {"url": "https://example.com/image.jpg", "detail": "auto"},
+            },
+        ]
+        self.assertEqual(result["messages"][0]["content"], expected_content)
+
+        # Test with multiple images
+        model_kwargs = {
+            "model": "gpt-4-vision-preview",
+            "images": [
+                "https://example.com/image1.jpg",
+                "https://example.com/image2.jpg",
+            ],
+            "detail": "high",
+        }
+        result = self.client.convert_inputs_to_api_kwargs(
+            input="Compare these images",
+            model_kwargs=model_kwargs,
+            model_type=ModelType.LLM,
+        )
+        expected_content = [
+            {"type": "text", "text": "Compare these images"},
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://example.com/image1.jpg",
+                    "detail": "high",
+                },
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": "https://example.com/image2.jpg",
+                    "detail": "high",
+                },
+            },
+        ]
+        self.assertEqual(result["messages"][0]["content"], expected_content)
+
     @patch("adalflow.components.model_client.openai_client.AsyncOpenAI")
     async def test_acall_llm(self, MockAsyncOpenAI):
         mock_async_client = AsyncMock()
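
The tests exercise the two image helpers directly. As a quick illustration of the behavior they pin down (a sketch, not part of the commit; the dummy api_key and URL are placeholders):

from adalflow.components.model_client import OpenAIClient

client = OpenAIClient(api_key="sk-test")  # dummy key, only used to construct the client

# URLs pass straight through as image_url content parts.
part = client._prepare_image_content("https://example.com/image.jpg", detail="high")
assert part == {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg", "detail": "high"}}

# Local files are read and base64-encoded by _encode_image; missing or
# unreadable paths surface as ValueError rather than a raw OSError.
try:
    client._encode_image("nonexistent.jpg")
except ValueError as e:
    print(e)  # "Image file not found: nonexistent.jpg"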

docs/source/tutorials/model_client.rst

Lines changed: 40 additions & 0 deletions
@@ -1513,6 +1513,46 @@ This is the function call that triggers the execution of the custom model client
 
    build_custom_model_client()
 
+
+OPENAI LLM Chat - Multimodal Example
+-------------------------------------------------
+
+The OpenAI client also supports multimodal inputs. Here's a quick example:
+
+.. code-block:: python
+
+    from adalflow import Generator, OpenAIClient
+
+    generator = Generator(
+        model_client=OpenAIClient(),
+        model_kwargs={
+            "model": "gpt-4o",
+            "max_tokens": 300
+        }
+    )
+
+    # Single image
+    response = generator(
+        prompt_kwargs={
+            "input_str": "What's in this image?",
+            "images": "path/to/image.jpg"  # Local file or URL
+        }
+    )
+
+    # Multiple images
+    response = generator(
+        prompt_kwargs={
+            "input_str": "Compare these images.",
+            "images": [
+                "path/to/first.jpg",
+                "https://example.com/second.jpg"
+            ]
+        }
+    )
+
+The client handles both local files and URLs, with support for PNG, JPEG, WEBP, and non-animated GIF formats.
+
+
 .. admonition:: API reference
    :class: highlight
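
For readers unfamiliar with OpenAI's vision message format, the request body the client ultimately sends for the single-image case looks roughly like the following (a hand-written sketch with placeholder values; a local file would appear as a base64 data URL):

payload = {
    "model": "gpt-4o",
    "max_tokens": 300,
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What's in this image?"},
                {
                    "type": "image_url",
                    "image_url": {"url": "data:image/jpeg;base64,<encoded bytes>", "detail": "auto"},
                },
            ],
        }
    ],
}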
