
Commit 1484e78

Author: Lloyd Hamilton (committed)
Merge branch 'main' into fix/added_bedrock_streaming
2 parents: cf923be + 13021df

File tree: 16 files changed (+2905, -1787 lines)


.github/workflows/python-test.yml

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@ jobs:
         python-version: ['3.9', '3.10', '3.11', '3.12']

     steps:
-      - uses: actions/checkout@v3 # Updated to the latest version
+      - uses: actions/checkout@v4 # Updated to the latest version
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v4 # Updated to the latest version
         with:
@@ -37,7 +37,7 @@ jobs:
           poetry run pytest

       - name: Upload pytest results as an artifact (optional)
-        uses: actions/upload-artifact@v3 # Updated to the latest version
+        uses: actions/upload-artifact@v4 # Updated to the latest version
        if: always() # Always run this step to ensure test results are saved even if previous steps fail
         with:
           name: pytest-results

adalflow/CHANGELOG.md

Lines changed: 13 additions & 6 deletions
@@ -1,16 +1,23 @@
-## [0.2.7] - 2024-09-23
+## [0.2.7] - 2025-01-16

-### Improved
-- Better diagnose report for `Trainer.diagnose`.
-- Multi-hop RAG with handling of Cycle.
-
-## [0.2.7] - TO Be Released
 ### Added
 - `Memory` is completed with `call` and `add_dialog_turn` methods.
 - Integrated `LanceDB` in the `Retriever`
+- Multi-modal (image input and generation) in `OpenAIClient` along with tests.
+- `ComponentList` to support a list of components registered in a component. Added `test_componentlist` to test the `ComponentList`.
+
 ### Improved
+- Better diagnose report for `Trainer.diagnose`.
 - `BedrockAPIClient` added more details on setup, yet it is still in experimental stage.
 - `AzureAPIClient` added more details on setup, yet it is still in experimental stage.
+- `Retriever` class:
+  - Support data id (field).
+- `GradComponent`: Support pass-through gradient for the `forward` method.
+
+Optimization
+- Aggregated all backward engine prompts in `backward_engine_prompt`.
+- Added `TGDData` for the optimizer to support reasoning at proposing new prompt.
+- Added `sequential_order` in the `Trainer` to support the sequential training order. Reorganized the trainer code.
 ## [0.2.6] - 2024-11-25
 ### Improved
 - Add default `max_tokens=512` to the `AnthropicAPIClient` to avoid the error when the user does not provide the `max_tokens` in the prompt.

adalflow/adalflow/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-__version__ = "0.2.6"
+__version__ = "0.2.7"

 from adalflow.core.component import Component, fun_to_component
 from adalflow.core.container import Sequential, ComponentList

adalflow/adalflow/components/model_client/openai_client.py

Lines changed: 175 additions & 10 deletions
@@ -1,6 +1,7 @@
 """OpenAI ModelClient integration."""

 import os
+import base64
 from typing import (
     Dict,
     Sequence,
@@ -35,6 +36,7 @@
 from openai.types import (
     Completion,
     CreateEmbeddingResponse,
+    Image,
 )
 from openai.types.chat import ChatCompletionChunk, ChatCompletion

@@ -99,7 +101,7 @@ def get_probabilities(completion: ChatCompletion) -> List[List[TokenLogProb]]:
 class OpenAIClient(ModelClient):
     __doc__ = r"""A component wrapper for the OpenAI API client.

-    Support both embedding and chat completion API.
+    Support both embedding and chat completion API, including multimodal capabilities.

     Users (1) simplify use ``Embedder`` and ``Generator`` components by passing OpenAIClient() as the model_client.
     (2) can use this as an example to create their own API client or extend this class(copying and modifing the code) in their own project.
@@ -110,6 +112,17 @@ class OpenAIClient(ModelClient):
     Instead
     - use :ref:`OutputParser<components-output_parsers>` for response parsing and formating.

+    For multimodal inputs, provide images in model_kwargs["images"] as a path, URL, or list of them.
+    The model must support vision capabilities (e.g., gpt-4o, gpt-4o-mini, o1, o1-mini).
+
+    For image generation, use model_type=ModelType.IMAGE_GENERATION and provide:
+    - model: "dall-e-3" or "dall-e-2"
+    - prompt: Text description of the image to generate
+    - size: "1024x1024", "1024x1792", or "1792x1024" for DALL-E 3; "256x256", "512x512", or "1024x1024" for DALL-E 2
+    - quality: "standard" or "hd" (DALL-E 3 only)
+    - n: Number of images to generate (1 for DALL-E 3, 1-10 for DALL-E 2)
+    - response_format: "url" or "b64_json"
+
     Args:
         api_key (Optional[str], optional): OpenAI API key. Defaults to None.
         chat_completion_parser (Callable[[Completion], Any], optional): A function to parse the chat completion to a str. Defaults to None.
@@ -118,6 +131,8 @@ class OpenAIClient(ModelClient):
     References:
         - Embeddings models: https://platform.openai.com/docs/guides/embeddings
         - Chat models: https://platform.openai.com/docs/guides/text-generation
+        - Vision models: https://platform.openai.com/docs/guides/vision
+        - Image models: https://platform.openai.com/docs/guides/images
         - OpenAI docs: https://platform.openai.com/docs/introduction
     """

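As a minimal usage sketch of the multimodal path described in the docstring above (not part of this diff; it assumes OPENAI_API_KEY is set in the environment, and the image URL is a placeholder):

# Sketch: multimodal chat via model_kwargs["images"], per the docstring above.
from adalflow.core import Generator
from adalflow.components.model_client.openai_client import OpenAIClient
from adalflow.utils import setup_env

setup_env()  # loads OPENAI_API_KEY from .env

gen = Generator(
    model_client=OpenAIClient(),
    model_kwargs={
        "model": "gpt-4o",  # must be a vision-capable model
        "images": "https://example.com/photo.jpg",  # path, URL, or a list of them
        "detail": "auto",  # 'auto', 'low', or 'high'
    },
)
response = gen({"input_str": "Describe this image."})
print(response.data)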
@@ -200,7 +215,7 @@ def track_completion_usage(
     def parse_embedding_response(
         self, response: CreateEmbeddingResponse
     ) -> EmbedderOutput:
-        r"""Parse the embedding response to a structure LightRAG components can understand.
+        r"""Parse the embedding response to a structure Adalflow components can understand.

         Should be called in ``Embedder``.
         """
@@ -218,7 +233,20 @@ def convert_inputs_to_api_kwargs(
     ) -> Dict:
         r"""
         Specify the API input type and output api_kwargs that will be used in _call and _acall methods.
-        Convert the Component's standard input, and system_input(chat model) and model_kwargs into API-specific format
+        Convert the Component's standard input, and system_input(chat model) and model_kwargs into API-specific format.
+        For multimodal inputs, images can be provided in model_kwargs["images"] as a string path, URL, or list of them.
+        The model specified in model_kwargs["model"] must support multimodal capabilities when using images.
+
+        Args:
+            input: The input text or messages to process
+            model_kwargs: Additional parameters including:
+                - images: Optional image source(s) as path, URL, or list of them
+                - detail: Image detail level ('auto', 'low', or 'high'), defaults to 'auto'
+                - model: The model to use (must support multimodal inputs if images are provided)
+            model_type: The type of model (EMBEDDER or LLM)
+
+        Returns:
+            Dict: API-specific kwargs for the model call
         """

         final_model_kwargs = model_kwargs.copy()
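For illustration (not part of the diff), the Args documented above translate into a direct call roughly like the following; the file name and URL are placeholders:

# Sketch: what the documented kwargs look like when invoked directly.
from adalflow.components.model_client.openai_client import OpenAIClient
from adalflow.core.types import ModelType

client = OpenAIClient()  # assumes OPENAI_API_KEY is set
api_kwargs = client.convert_inputs_to_api_kwargs(
    input="What is in this image?",
    model_kwargs={
        "model": "gpt-4o-mini",
        "images": ["photo1.jpg", "https://example.com/photo2.png"],  # local path + URL
        "detail": "low",
    },
    model_type=ModelType.LLM,
)
# api_kwargs now contains "model" plus an OpenAI-format "messages" list.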
@@ -232,6 +260,8 @@ def convert_inputs_to_api_kwargs(
         elif model_type == ModelType.LLM:
             # convert input to messages
             messages: List[Dict[str, str]] = []
+            images = final_model_kwargs.pop("images", None)
+            detail = final_model_kwargs.pop("detail", "auto")

             if self._input_type == "messages":
                 system_start_tag = "<START_OF_SYSTEM_PROMPT>"
@@ -248,19 +278,74 @@ def convert_inputs_to_api_kwargs(
                 if match:
                     system_prompt = match.group(1)
                     input_str = match.group(2)
-
                 else:
                     print("No match found.")
                 if system_prompt and input_str:
                     messages.append({"role": "system", "content": system_prompt})
-                    messages.append({"role": "user", "content": input_str})
+                    if images:
+                        content = [{"type": "text", "text": input_str}]
+                        if isinstance(images, (str, dict)):
+                            images = [images]
+                        for img in images:
+                            content.append(self._prepare_image_content(img, detail))
+                        messages.append({"role": "user", "content": content})
+                    else:
+                        messages.append({"role": "user", "content": input_str})
             if len(messages) == 0:
-                messages.append({"role": "system", "content": input})
+                if images:
+                    content = [{"type": "text", "text": input}]
+                    if isinstance(images, (str, dict)):
+                        images = [images]
+                    for img in images:
+                        content.append(self._prepare_image_content(img, detail))
+                    messages.append({"role": "user", "content": content})
+                else:
+                    messages.append({"role": "system", "content": input})
             final_model_kwargs["messages"] = messages
+        elif model_type == ModelType.IMAGE_GENERATION:
+            # For image generation, input is the prompt
+            final_model_kwargs["prompt"] = input
+            # Ensure model is specified
+            if "model" not in final_model_kwargs:
+                raise ValueError("model must be specified for image generation")
+            # Set defaults for DALL-E 3 if not specified
+            final_model_kwargs["size"] = final_model_kwargs.get("size", "1024x1024")
+            final_model_kwargs["quality"] = final_model_kwargs.get(
+                "quality", "standard"
+            )
+            final_model_kwargs["n"] = final_model_kwargs.get("n", 1)
+            final_model_kwargs["response_format"] = final_model_kwargs.get(
+                "response_format", "url"
+            )
+
+            # Handle image edits and variations
+            image = final_model_kwargs.get("image")
+            if isinstance(image, str) and os.path.isfile(image):
+                final_model_kwargs["image"] = self._encode_image(image)
+
+            mask = final_model_kwargs.get("mask")
+            if isinstance(mask, str) and os.path.isfile(mask):
+                final_model_kwargs["mask"] = self._encode_image(mask)
         else:
             raise ValueError(f"model_type {model_type} is not supported")
         return final_model_kwargs

+    def parse_image_generation_response(self, response: List[Image]) -> GeneratorOutput:
+        """Parse the image generation response into a GeneratorOutput."""
+        try:
+            # Extract URLs or base64 data from the response
+            data = [img.url or img.b64_json for img in response]
+            # For single image responses, unwrap from list
+            if len(data) == 1:
+                data = data[0]
+            return GeneratorOutput(
+                data=data,
+                raw_response=str(response),
+            )
+        except Exception as e:
+            log.error(f"Error parsing image generation response: {e}")
+            return GeneratorOutput(data=None, error=str(e), raw_response=str(response))
+
     @backoff.on_exception(
         backoff.expo,
         (
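To make the branching above concrete: a text-plus-image input produces a single user message whose content is a list of parts. A sketch of the expected shape (values illustrative, base64 payload truncated):

# Expected shape of the "messages" entry built above for a text + image input.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {
                "type": "image_url",
                "image_url": {
                    # local files are inlined via _encode_image; URLs pass through
                    "url": "data:image/jpeg;base64,/9j/4AAQ...",
                    "detail": "auto",
                },
            },
        ],
    }
]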
@@ -285,6 +370,19 @@ def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINE
                 self.chat_completion_parser = handle_streaming_response
                 return self.sync_client.chat.completions.create(**api_kwargs)
             return self.sync_client.chat.completions.create(**api_kwargs)
+        elif model_type == ModelType.IMAGE_GENERATION:
+            # Determine which image API to call based on the presence of image/mask
+            if "image" in api_kwargs:
+                if "mask" in api_kwargs:
+                    # Image edit
+                    response = self.sync_client.images.edit(**api_kwargs)
+                else:
+                    # Image variation
+                    response = self.sync_client.images.create_variation(**api_kwargs)
+            else:
+                # Image generation
+                response = self.sync_client.images.generate(**api_kwargs)
+            return response.data
         else:
             raise ValueError(f"model_type {model_type} is not supported")
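Combining the two new IMAGE_GENERATION branches (kwargs construction above, API dispatch here), a minimal end-to-end sketch (not part of the diff; assumes OPENAI_API_KEY is set, prompt is a placeholder):

from adalflow.components.model_client.openai_client import OpenAIClient
from adalflow.core.types import ModelType

client = OpenAIClient()
api_kwargs = client.convert_inputs_to_api_kwargs(
    input="A watercolor lighthouse at dusk",  # becomes the "prompt"
    model_kwargs={"model": "dall-e-3"},  # size/quality/n/response_format get defaults
    model_type=ModelType.IMAGE_GENERATION,
)
data = client.call(api_kwargs=api_kwargs, model_type=ModelType.IMAGE_GENERATION)
output = client.parse_image_generation_response(data)
print(output.data)  # a single URL, since response_format defaults to "url" and n=1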

@@ -311,6 +409,21 @@ async def acall(
             return await self.async_client.embeddings.create(**api_kwargs)
         elif model_type == ModelType.LLM:
             return await self.async_client.chat.completions.create(**api_kwargs)
+        elif model_type == ModelType.IMAGE_GENERATION:
+            # Determine which image API to call based on the presence of image/mask
+            if "image" in api_kwargs:
+                if "mask" in api_kwargs:
+                    # Image edit
+                    response = await self.async_client.images.edit(**api_kwargs)
+                else:
+                    # Image variation
+                    response = await self.async_client.images.create_variation(
+                        **api_kwargs
+                    )
+            else:
+                # Image generation
+                response = await self.async_client.images.generate(**api_kwargs)
+            return response.data
         else:
             raise ValueError(f"model_type {model_type} is not supported")

@@ -332,22 +445,74 @@ def to_dict(self) -> Dict[str, Any]:
         output = super().to_dict(exclude=exclude)
         return output

+    def _encode_image(self, image_path: str) -> str:
+        """Encode image to base64 string.
+
+        Args:
+            image_path: Path to image file.
+
+        Returns:
+            Base64 encoded image string.
+
+        Raises:
+            ValueError: If the file cannot be read or doesn't exist.
+        """
+        try:
+            with open(image_path, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode("utf-8")
+        except FileNotFoundError:
+            raise ValueError(f"Image file not found: {image_path}")
+        except PermissionError:
+            raise ValueError(f"Permission denied when reading image file: {image_path}")
+        except Exception as e:
+            raise ValueError(f"Error encoding image {image_path}: {str(e)}")
+
+    def _prepare_image_content(
+        self, image_source: Union[str, Dict[str, Any]], detail: str = "auto"
+    ) -> Dict[str, Any]:
+        """Prepare image content for API request.
+
+        Args:
+            image_source: Either a path to local image or a URL.
+            detail: Image detail level ('auto', 'low', or 'high').

+        Returns:
+            Formatted image content for API request.
+        """
+        if isinstance(image_source, str):
+            if image_source.startswith(("http://", "https://")):
+                return {
+                    "type": "image_url",
+                    "image_url": {"url": image_source, "detail": detail},
+                }
+            else:
+                base64_image = self._encode_image(image_source)
+                return {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{base64_image}",
+                        "detail": detail,
+                    },
+                }
+        return image_source
+
+
+# Example usage:
 # if __name__ == "__main__":
 #     from adalflow.core import Generator
 #     from adalflow.utils import setup_env, get_logger
-
+#
 #     log = get_logger(level="DEBUG")
-
+#
 #     setup_env()
 #     prompt_kwargs = {"input_str": "What is the meaning of life?"}
-
+#
 #     gen = Generator(
 #         model_client=OpenAIClient(),
 #         model_kwargs={"model": "gpt-3.5-turbo", "stream": True},
 #     )
 #     gen_response = gen(prompt_kwargs)
 #     print(f"gen_response: {gen_response}")
-
+#
 #     for genout in gen_response.data:
 #         print(f"genout: {genout}")
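For reference, a sketch of what the new _prepare_image_content helper returns for its two string cases (illustrative values; note the data URL hard-codes image/jpeg regardless of the file's actual format):

from adalflow.components.model_client.openai_client import OpenAIClient

client = OpenAIClient()  # assumes OPENAI_API_KEY is set

# URL source: passed through as a remote reference.
client._prepare_image_content("https://example.com/cat.png", detail="low")
# -> {"type": "image_url",
#     "image_url": {"url": "https://example.com/cat.png", "detail": "low"}}

# Local path: read from disk and inlined as a base64 data URL via _encode_image.
client._prepare_image_content("cat.jpg", detail="high")
# -> {"type": "image_url",
#     "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQ...", "detail": "high"}}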

adalflow/adalflow/core/types.py

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ class ModelType(Enum):
     EMBEDDER = auto()
     LLM = auto()
     RERANKER = auto() # ranking model
+    IMAGE_GENERATION = auto() # image generation models like DALL-E
     UNDEFINED = auto()

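A short sketch of how the new enum member is used for dispatch, mirroring OpenAIClient.call above:

from adalflow.core.types import ModelType

# Clients branch on ModelType to route between chat, embedding, rerank, and image APIs.
mt = ModelType.IMAGE_GENERATION
assert mt is not ModelType.LLM  # a distinct auto() value
# e.g. OpenAIClient().call(api_kwargs=api_kwargs, model_type=ModelType.IMAGE_GENERATION)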