11"""OpenAI ModelClient integration."""
22
33import os
4+ import base64
45from typing import (
56 Dict ,
67 Sequence ,
3536from openai .types import (
3637 Completion ,
3738 CreateEmbeddingResponse ,
39+ Image ,
3840)
3941from openai .types .chat import ChatCompletionChunk , ChatCompletion
4042
@@ -99,7 +101,7 @@ def get_probabilities(completion: ChatCompletion) -> List[List[TokenLogProb]]:
 class OpenAIClient(ModelClient):
     __doc__ = r"""A component wrapper for the OpenAI API client.
 
-    Support both embedding and chat completion API.
+    Support both embedding and chat completion APIs, including multimodal capabilities.
 
     Users can (1) simply use ``Embedder`` and ``Generator`` components by passing OpenAIClient() as the model_client,
     or (2) use this as an example to create their own API client or extend this class (copying and modifying the code) in their own project.
@@ -110,6 +112,17 @@ class OpenAIClient(ModelClient):
     Instead
     - use :ref:`OutputParser<components-output_parsers>` for response parsing and formatting.
 
+    For multimodal inputs, provide images in model_kwargs["images"] as a path, URL, or a list of them.
+    The model must support vision capabilities (e.g., gpt-4o, gpt-4o-mini, o1).
+
+    For image generation, use model_type=ModelType.IMAGE_GENERATION and provide:
+    - model: "dall-e-3" or "dall-e-2"
+    - prompt: Text description of the image to generate
+    - size: "1024x1024", "1024x1792", or "1792x1024" for DALL-E 3; "256x256", "512x512", or "1024x1024" for DALL-E 2
+    - quality: "standard" or "hd" (DALL-E 3 only)
+    - n: Number of images to generate (1 for DALL-E 3, 1-10 for DALL-E 2)
+    - response_format: "url" or "b64_json"
+
     Args:
         api_key (Optional[str], optional): OpenAI API key. Defaults to None.
         chat_completion_parser (Callable[[Completion], Any], optional): A function to parse the chat completion into a str. Defaults to None.
@@ -118,6 +131,8 @@ class OpenAIClient(ModelClient):
     References:
         - Embeddings models: https://platform.openai.com/docs/guides/embeddings
         - Chat models: https://platform.openai.com/docs/guides/text-generation
+        - Vision models: https://platform.openai.com/docs/guides/vision
+        - Image models: https://platform.openai.com/docs/guides/images
         - OpenAI docs: https://platform.openai.com/docs/introduction
     """
 
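# A minimal usage sketch of the multimodal path documented above, assuming a
# vision-capable model; the local file name and the URL are placeholders.
#
# from adalflow.core import Generator
#
# gen = Generator(
#     model_client=OpenAIClient(),
#     model_kwargs={
#         "model": "gpt-4o",
#         "images": ["dog.jpg", "https://example.com/cat.png"],
#         "detail": "low",
#     },
# )
# print(gen({"input_str": "Describe each image."}))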
@@ -200,7 +215,7 @@ def track_completion_usage(
     def parse_embedding_response(
         self, response: CreateEmbeddingResponse
     ) -> EmbedderOutput:
-        r"""Parse the embedding response to a structure LightRAG components can understand.
+        r"""Parse the embedding response to a structure Adalflow components can understand.
 
         Should be called in ``Embedder``.
         """
@@ -218,7 +233,20 @@ def convert_inputs_to_api_kwargs(
     ) -> Dict:
         r"""
         Specify the API input type and output api_kwargs that will be used in _call and _acall methods.
-        Convert the Component's standard input, and system_input(chat model) and model_kwargs into API-specific format
+        Convert the Component's standard input, system_input (chat model), and model_kwargs into an API-specific format.
+        For multimodal inputs, images can be provided in model_kwargs["images"] as a string path, URL, or a list of them.
+        The model specified in model_kwargs["model"] must support multimodal capabilities when images are used.
+
+        Args:
+            input: The input text or messages to process.
+            model_kwargs: Additional parameters including:
+                - images: Optional image source(s) as a path, URL, or list of them
+                - detail: Image detail level ('auto', 'low', or 'high'); defaults to 'auto'
+                - model: The model to use (must support multimodal inputs if images are provided)
+            model_type: The type of model (EMBEDDER, LLM, or IMAGE_GENERATION).
+
+        Returns:
+            Dict: API-specific kwargs for the model call.
         """
 
         final_model_kwargs = model_kwargs.copy()
@@ -232,6 +260,8 @@ def convert_inputs_to_api_kwargs(
         elif model_type == ModelType.LLM:
             # convert input to messages
             messages: List[Dict[str, str]] = []
+            images = final_model_kwargs.pop("images", None)
+            detail = final_model_kwargs.pop("detail", "auto")
 
             if self._input_type == "messages":
                 system_start_tag = "<START_OF_SYSTEM_PROMPT>"
@@ -248,19 +278,74 @@ def convert_inputs_to_api_kwargs(
                 if match:
                     system_prompt = match.group(1)
                     input_str = match.group(2)
-
                 else:
                     print("No match found.")
                 if system_prompt and input_str:
                     messages.append({"role": "system", "content": system_prompt})
-                    messages.append({"role": "user", "content": input_str})
+                    if images:
+                        content = [{"type": "text", "text": input_str}]
+                        if isinstance(images, (str, dict)):
+                            images = [images]
+                        for img in images:
+                            content.append(self._prepare_image_content(img, detail))
+                        messages.append({"role": "user", "content": content})
+                    else:
+                        messages.append({"role": "user", "content": input_str})
             if len(messages) == 0:
-                messages.append({"role": "system", "content": input})
+                if images:
+                    content = [{"type": "text", "text": input}]
+                    if isinstance(images, (str, dict)):
+                        images = [images]
+                    for img in images:
+                        content.append(self._prepare_image_content(img, detail))
+                    messages.append({"role": "user", "content": content})
+                else:
+                    messages.append({"role": "system", "content": input})
             final_model_kwargs["messages"] = messages
+        elif model_type == ModelType.IMAGE_GENERATION:
+            # For image generation, the input is the prompt
+            final_model_kwargs["prompt"] = input
+            # Ensure a model is specified
+            if "model" not in final_model_kwargs:
+                raise ValueError("model must be specified for image generation")
+            # Set DALL-E 3 defaults if not specified
+            final_model_kwargs["size"] = final_model_kwargs.get("size", "1024x1024")
+            final_model_kwargs["quality"] = final_model_kwargs.get(
+                "quality", "standard"
+            )
+            final_model_kwargs["n"] = final_model_kwargs.get("n", 1)
+            final_model_kwargs["response_format"] = final_model_kwargs.get(
+                "response_format", "url"
+            )
+
+            # Handle image edits and variations: the images API expects binary
+            # file content, so pass an open file handle rather than base64 text
+            image = final_model_kwargs.get("image")
+            if isinstance(image, str) and os.path.isfile(image):
+                final_model_kwargs["image"] = open(image, "rb")
+
+            mask = final_model_kwargs.get("mask")
+            if isinstance(mask, str) and os.path.isfile(mask):
+                final_model_kwargs["mask"] = open(mask, "rb")
         else:
             raise ValueError(f"model_type {model_type} is not supported")
         return final_model_kwargs
 
+    def parse_image_generation_response(self, response: List[Image]) -> GeneratorOutput:
+        """Parse the image generation response into a GeneratorOutput."""
+        try:
+            # Extract URLs or base64 data from the response
+            data = [img.url or img.b64_json for img in response]
+            # For single-image responses, unwrap from the list
+            if len(data) == 1:
+                data = data[0]
+            return GeneratorOutput(
+                data=data,
+                raw_response=str(response),
+            )
+        except Exception as e:
+            log.error(f"Error parsing image generation response: {e}")
+            return GeneratorOutput(data=None, error=str(e), raw_response=str(response))
+
     @backoff.on_exception(
         backoff.expo,
         (
@@ -285,6 +370,19 @@ def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINE
                 self.chat_completion_parser = handle_streaming_response
                 return self.sync_client.chat.completions.create(**api_kwargs)
             return self.sync_client.chat.completions.create(**api_kwargs)
+        elif model_type == ModelType.IMAGE_GENERATION:
+            # Determine which image API to call based on the presence of image/mask
+            if "image" in api_kwargs:
+                if "mask" in api_kwargs:
+                    # Image edit
+                    response = self.sync_client.images.edit(**api_kwargs)
+                else:
+                    # Image variation
+                    response = self.sync_client.images.create_variation(**api_kwargs)
+            else:
+                # Image generation
+                response = self.sync_client.images.generate(**api_kwargs)
+            return response.data
         else:
             raise ValueError(f"model_type {model_type} is not supported")
 
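# A minimal sketch of the sync image-generation path above; the prompt is a
# placeholder, and ModelType is assumed to be importable from adalflow.core.types.
#
# client = OpenAIClient()
# kwargs = client.convert_inputs_to_api_kwargs(
#     input="A watercolor fox in the snow",
#     model_kwargs={"model": "dall-e-3"},
#     model_type=ModelType.IMAGE_GENERATION,
# )
# images = client.call(api_kwargs=kwargs, model_type=ModelType.IMAGE_GENERATION)
# print(client.parse_image_generation_response(images).data)  # URL of the image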
@@ -311,6 +409,21 @@ async def acall(
             return await self.async_client.embeddings.create(**api_kwargs)
         elif model_type == ModelType.LLM:
             return await self.async_client.chat.completions.create(**api_kwargs)
+        elif model_type == ModelType.IMAGE_GENERATION:
+            # Determine which image API to call based on the presence of image/mask
+            if "image" in api_kwargs:
+                if "mask" in api_kwargs:
+                    # Image edit
+                    response = await self.async_client.images.edit(**api_kwargs)
+                else:
+                    # Image variation
+                    response = await self.async_client.images.create_variation(
+                        **api_kwargs
+                    )
+            else:
+                # Image generation
+                response = await self.async_client.images.generate(**api_kwargs)
+            return response.data
         else:
             raise ValueError(f"model_type {model_type} is not supported")
 
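# The async path mirrors the sync one; a brief sketch, reusing the kwargs built
# in the previous example:
#
# import asyncio
# images = asyncio.run(
#     client.acall(api_kwargs=kwargs, model_type=ModelType.IMAGE_GENERATION)
# )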
@@ -332,22 +445,74 @@ def to_dict(self) -> Dict[str, Any]:
         output = super().to_dict(exclude=exclude)
         return output
 
+    def _encode_image(self, image_path: str) -> str:
+        """Encode an image file to a base64 string.
+
+        Args:
+            image_path: Path to the image file.
+
+        Returns:
+            Base64-encoded image string.
+
+        Raises:
+            ValueError: If the file cannot be read or doesn't exist.
+        """
+        try:
+            with open(image_path, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode("utf-8")
+        except FileNotFoundError:
+            raise ValueError(f"Image file not found: {image_path}")
+        except PermissionError:
+            raise ValueError(f"Permission denied when reading image file: {image_path}")
+        except Exception as e:
+            raise ValueError(f"Error encoding image {image_path}: {str(e)}")
+
+    def _prepare_image_content(
+        self, image_source: Union[str, Dict[str, Any]], detail: str = "auto"
+    ) -> Dict[str, Any]:
+        """Prepare image content for an API request.
+
+        Args:
+            image_source: Either a path to a local image or a URL.
+            detail: Image detail level ('auto', 'low', or 'high').
 
+        Returns:
+            Formatted image content for the API request.
+        """
+        if isinstance(image_source, str):
+            if image_source.startswith(("http://", "https://")):
+                return {
+                    "type": "image_url",
+                    "image_url": {"url": image_source, "detail": detail},
+                }
+            else:
+                base64_image = self._encode_image(image_source)
+                return {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{base64_image}",
+                        "detail": detail,
+                    },
+                }
+        return image_source
+
+
+# Example usage:
 # if __name__ == "__main__":
 #     from adalflow.core import Generator
 #     from adalflow.utils import setup_env, get_logger
-
+#
 #     log = get_logger(level="DEBUG")
-
+#
 #     setup_env()
 #     prompt_kwargs = {"input_str": "What is the meaning of life?"}
-
+#
 #     gen = Generator(
 #         model_client=OpenAIClient(),
 #         model_kwargs={"model": "gpt-3.5-turbo", "stream": True},
 #     )
 #     gen_response = gen(prompt_kwargs)
 #     print(f"gen_response: {gen_response}")
-
+#
 #     for genout in gen_response.data:
 #         print(f"genout: {genout}")