log = logging.getLogger(__name__)
T = TypeVar("T")

-# Models that support multimodal inputs
-MULTIMODAL_MODELS = {
-    "gpt-4o",  # Versatile, high-intelligence flagship model
-    "gpt-4o-mini",  # Fast, affordable small model for focused tasks
-    "o1",  # Reasoning model that excels at complex, multi-step tasks
-    "o1-mini",  # Smaller reasoning model for complex tasks
-}
-

# completion parsing functions; you can combine them into one single chat completion parser
def get_first_message_content(completion: ChatCompletion) -> str:
@@ -108,7 +100,7 @@ def get_probabilities(completion: ChatCompletion) -> List[List[TokenLogProb]]:
class OpenAIClient(ModelClient):
    __doc__ = r"""A component wrapper for the OpenAI API client.

-    Support both embedding and chat completion API.
+    Supports both the embedding and chat completion APIs, including multimodal capabilities.

    Users can (1) simply use the ``Embedder`` and ``Generator`` components by passing OpenAIClient() as the model_client,
    or (2) use this as an example to create their own API client or extend this class (copying and modifying the code) in their own project.
@@ -119,6 +111,9 @@ class OpenAIClient(ModelClient):
    Instead
    - use :ref:`OutputParser<components-output_parsers>` for response parsing and formatting.

+    For multimodal inputs, provide images in model_kwargs["images"] as a file path, a URL, or a list of them.
+    The model must support vision capabilities (e.g., gpt-4o, gpt-4o-mini, o1, o1-mini).
+
    Args:
        api_key (Optional[str], optional): OpenAI API key. Defaults to None.
        chat_completion_parser (Callable[[Completion], Any], optional): A function to parse the chat completion to a str. Defaults to None.
@@ -127,6 +122,7 @@ class OpenAIClient(ModelClient):
    References:
        - Embeddings models: https://platform.openai.com/docs/guides/embeddings
        - Chat models: https://platform.openai.com/docs/guides/text-generation
+        - Vision models: https://platform.openai.com/docs/guides/vision
        - OpenAI docs: https://platform.openai.com/docs/introduction
    """

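A minimal usage sketch of the multimodal path described in the docstring above, modeled on the commented example at the end of this file (the image URL is a placeholder, and any vision-capable model works):

    from adalflow.core import Generator
    from adalflow.utils import setup_env

    setup_env()  # assumed to load OPENAI_API_KEY, as in the example below
    gen = Generator(
        model_client=OpenAIClient(),
        model_kwargs={
            "model": "gpt-4o-mini",  # must support vision
            "images": "https://example.com/photo.jpg",  # placeholder; a local path also works
            "detail": "auto",
        },
    )
    print(gen({"input_str": "Describe this image."}))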
@@ -209,7 +205,7 @@ def track_completion_usage(
    def parse_embedding_response(
        self, response: CreateEmbeddingResponse
    ) -> EmbedderOutput:
-        r"""Parse the embedding response to a structure LightRAG components can understand.
+        r"""Parse the embedding response to a structure Adalflow components can understand.

        Should be called in ``Embedder``.
        """
@@ -227,7 +223,20 @@ def convert_inputs_to_api_kwargs(
    ) -> Dict:
        r"""
        Specify the API input type and output api_kwargs that will be used in _call and _acall methods.
-        Convert the Component's standard input, and system_input(chat model) and model_kwargs into API-specific format
+        Convert the Component's standard input and system_input (chat model), together with model_kwargs, into an API-specific format.
+        For multimodal inputs, images can be provided in model_kwargs["images"] as a string path, a URL, or a list of them.
+        The model specified in model_kwargs["model"] must support multimodal capabilities when images are provided.
+
+        Args:
+            input: The input text or messages to process.
+            model_kwargs: Additional parameters, including:
+                - images: Optional image source(s) as a path, URL, or list of them.
+                - detail: Image detail level ('auto', 'low', or 'high'); defaults to 'auto'.
+                - model: The model to use (must support multimodal inputs if images are provided).
+            model_type: The type of model (EMBEDDER or LLM).
+
+        Returns:
+            Dict: API-specific kwargs for the model call.
        """

        final_model_kwargs = model_kwargs.copy()
@@ -241,6 +250,8 @@ def convert_inputs_to_api_kwargs(
        elif model_type == ModelType.LLM:
            # convert input to messages
            messages: List[Dict[str, str]] = []
+            images = final_model_kwargs.pop("images", None)
+            detail = final_model_kwargs.pop("detail", "auto")

            if self._input_type == "messages":
                system_start_tag = "<START_OF_SYSTEM_PROMPT>"
@@ -257,14 +268,29 @@ def convert_inputs_to_api_kwargs(
                if match:
                    system_prompt = match.group(1)
                    input_str = match.group(2)
-
                else:
                    print("No match found.")
                if system_prompt and input_str:
                    messages.append({"role": "system", "content": system_prompt})
-                    messages.append({"role": "user", "content": input_str})
+                    if images:
+                        content = [{"type": "text", "text": input_str}]
+                        if isinstance(images, (str, dict)):
+                            images = [images]
+                        for img in images:
+                            content.append(self._prepare_image_content(img, detail))
+                        messages.append({"role": "user", "content": content})
+                    else:
+                        messages.append({"role": "user", "content": input_str})
            if len(messages) == 0:
-                messages.append({"role": "system", "content": input})
+                if images:
+                    content = [{"type": "text", "text": input}]
+                    if isinstance(images, (str, dict)):
+                        images = [images]
+                    for img in images:
+                        content.append(self._prepare_image_content(img, detail))
+                    messages.append({"role": "user", "content": content})
+                else:
+                    messages.append({"role": "system", "content": input})
            final_model_kwargs["messages"] = messages
        else:
            raise ValueError(f"model_type {model_type} is not supported")
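For illustration, a sketch of the api_kwargs the new code path produces for a multimodal LLM call; the exact image_url dict comes from _prepare_image_content, so the commented output below is an approximation:

    client = OpenAIClient()
    api_kwargs = client.convert_inputs_to_api_kwargs(
        input="What is in this image?",
        model_kwargs={"model": "gpt-4o-mini", "images": "https://example.com/photo.jpg"},
        model_type=ModelType.LLM,
    )
    # api_kwargs["messages"] is then roughly:
    # [{"role": "user",
    #   "content": [
    #       {"type": "text", "text": "What is in this image?"},
    #       {"type": "image_url",
    #        "image_url": {"url": "https://example.com/photo.jpg", "detail": "auto"}},
    #   ]}]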
@@ -349,9 +375,19 @@ def _encode_image(self, image_path: str) -> str:

        Returns:
            Base64 encoded image string.
+
+        Raises:
+            ValueError: If the file cannot be read or doesn't exist.
        """
-        with open(image_path, "rb") as image_file:
-            return base64.b64encode(image_file.read()).decode("utf-8")
+        try:
+            with open(image_path, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode("utf-8")
+        except FileNotFoundError:
+            raise ValueError(f"Image file not found: {image_path}")
+        except PermissionError:
+            raise ValueError(f"Permission denied when reading image file: {image_path}")
+        except Exception as e:
+            raise ValueError(f"Error encoding image {image_path}: {str(e)}")

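For context, the base64 string returned here is presumably wrapped into a data URL by _prepare_image_content before being sent to the API; a sketch under that assumption (the JPEG MIME type is a simplification, and "photo.jpg" is a hypothetical file):

    b64 = client._encode_image("photo.jpg")  # hypothetical local file
    image_block = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "auto"},
    }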
    def _prepare_image_content(
        self, image_source: Union[str, Dict[str, Any]], detail: str = "auto"
@@ -382,77 +418,23 @@ def _prepare_image_content(
            }
        return image_source

-    def generate(
-        self,
-        prompt: str,
-        images: Optional[
-            Union[str, List[str], Dict[str, Any], List[Dict[str, Any]]]
-        ] = None,
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> GeneratorOutput:
-        """Generate text response for given prompt and optionally images.
-
-        Args:
-            prompt: Text prompt.
-            images: Optional image source(s) - can be path(s), URL(s), or formatted dict(s).
-            model_kwargs: Additional model parameters.
-
-        Returns:
-            GeneratorOutput containing the model's response.
-        """
-        model_kwargs = model_kwargs or {}
-        model = model_kwargs.get("model", "gpt-4o-mini")
-        max_tokens = model_kwargs.get("max_tokens", 300)
-        detail = model_kwargs.get("detail", "auto")
-
-        # Check if model supports multimodal inputs when images are provided
-        if images and model not in MULTIMODAL_MODELS:
-            return GeneratorOutput(
-                error=f"Model {model} does not support multimodal inputs. Supported models: {MULTIMODAL_MODELS}"
-            )
-
-        # Prepare message content
-        if images:
-            content = [{"type": "text", "text": prompt}]
-            if not isinstance(images, list):
-                images = [images]
-            for img in images:
-                content.append(self._prepare_image_content(img, detail))
-            messages = [{"role": "user", "content": content}]
-        else:
-            messages = [{"role": "user", "content": prompt}]
-
-        try:
-            response = self.client.chat.completions.create(
-                model=model,
-                messages=messages,
-                max_tokens=max_tokens,
-            )
-            return GeneratorOutput(
-                id=response.id,
-                data=response.choices[0].message.content,
-                usage=response.usage.model_dump() if response.usage else None,
-                raw_response=response.model_dump(),
-            )
-        except Exception as e:
-            return GeneratorOutput(error=str(e))
-

+# Example usage:
# if __name__ == "__main__":
#     from adalflow.core import Generator
#     from adalflow.utils import setup_env, get_logger
-
+#
#     log = get_logger(level="DEBUG")
-
+#
#     setup_env()
#     prompt_kwargs = {"input_str": "What is the meaning of life?"}
-
+#
#     gen = Generator(
#         model_client=OpenAIClient(),
#         model_kwargs={"model": "gpt-3.5-turbo", "stream": True},
#     )
#     gen_response = gen(prompt_kwargs)
#     print(f"gen_response: {gen_response}")
-
+#
#     for genout in gen_response.data:
#         print(f"genout: {genout}")
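A multimodal variant of the commented example above might look like the following sketch (the image sources are placeholders, and the model must be vision-capable):

    multimodal_gen = Generator(
        model_client=OpenAIClient(),
        model_kwargs={
            "model": "gpt-4o",
            "images": ["photo1.jpg", "https://example.com/photo2.jpg"],  # placeholder sources
            "detail": "high",
        },
    )
    multimodal_response = multimodal_gen({"input_str": "Compare these two images."})
    print(f"multimodal_response: {multimodal_response}")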