
Commit 839d018

multimodal kwargs update
1 parent a3d6f0f commit 839d018

File tree

3 files changed: +194, -25 lines changed

adalflow/adalflow/components/model_client/openai_client.py

Lines changed: 13 additions & 5 deletions
@@ -140,27 +140,27 @@ class OpenAIClient(ModelClient):
     def __init__(
         self,
         api_key: Optional[str] = None,
+        model_type: ModelType = ModelType.LLM,
         chat_completion_parser: Callable[[Completion], Any] = None,
         input_type: Literal["text", "messages"] = "text",
-        model_type: ModelType = ModelType.LLM,
     ):
-        r"""It is recommended to set the OPENAI_API_KEY environment variable instead of passing it as an argument.
+        r"""Initialize the OpenAI client.
 
         Args:
             api_key (Optional[str], optional): OpenAI API key. Defaults to None.
-            chat_completion_parser (Callable[[Completion], Any], optional): A function to parse the chat completion to a str. Defaults to None.
-            input_type (Literal["text", "messages"], optional): The type of input to use. Defaults to "text".
             model_type (ModelType, optional): The type of model to use (EMBEDDER, LLM, or IMAGE_GENERATION). Defaults to ModelType.LLM.
+            chat_completion_parser (Callable[[Completion], Any], optional): A function to parse chat completions. Defaults to None.
+            input_type (Literal["text", "messages"], optional): The type of input to use. Defaults to "text".
         """
         super().__init__()
         self._api_key = api_key
+        self.model_type = model_type
         self.sync_client = self.init_sync_client()
         self.async_client = None  # only initialize if the async call is called
         self.chat_completion_parser = (
             chat_completion_parser or get_first_message_content
         )
         self._input_type = input_type
-        self.model_type = model_type
 
     def init_sync_client(self):
         api_key = self._api_key or os.getenv("OPENAI_API_KEY")
@@ -235,6 +235,7 @@ def convert_inputs_to_api_kwargs(
         self,
         input: Optional[Any] = None,
         model_kwargs: Dict = {},
+        model_type: ModelType = ModelType.UNDEFINED,
     ) -> Dict:
         r"""
         Specify the API input type and output api_kwargs that will be used in _call and _acall methods.
@@ -259,6 +260,7 @@ def convert_inputs_to_api_kwargs(
                 - mask: Path to the mask image
             For variations (DALL-E 2 only):
                 - image: Path to the input image
+            model_type: The type of model to use (EMBEDDER, LLM, or IMAGE_GENERATION)
 
         Returns:
             Dict: API-specific kwargs for the model call
@@ -397,6 +399,9 @@ def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED
         """
         kwargs is the combined input and model_kwargs. Support streaming call.
         """
+        # Use self.model_type if no model_type is provided or if UNDEFINED
+        model_type = self.model_type if model_type == ModelType.UNDEFINED else model_type
+
         log.info(f"api_kwargs: {api_kwargs}")
         if model_type == ModelType.EMBEDDER:
             return self.sync_client.embeddings.create(**api_kwargs)
@@ -446,6 +451,9 @@ async def acall(
         """
         kwargs is the combined input and model_kwargs
         """
+        # Use self.model_type if no model_type is provided or if UNDEFINED
+        model_type = self.model_type if model_type == ModelType.UNDEFINED else model_type
+
         if self.async_client is None:
             self.async_client = self.init_async_client()
         if model_type == ModelType.EMBEDDER:
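
In effect, the client now carries its model type and call/acall fall back to it when no model_type is passed. A minimal sketch of the resulting usage, mirroring the updated tests (assumes OPENAI_API_KEY is set in the environment; the prompt and model name are placeholders):

    from adalflow.components.model_client import OpenAIClient
    from adalflow.core.types import ModelType

    # The model type is set once on the client instead of on every call.
    dalle_client = OpenAIClient(model_type=ModelType.IMAGE_GENERATION)

    # Build the provider-specific kwargs for an image-generation request.
    api_kwargs = dalle_client.convert_inputs_to_api_kwargs(
        input="a white siamese cat",
        model_kwargs={"model": "dall-e-3"},
    )

    # No model_type argument: call() sees ModelType.UNDEFINED and falls back
    # to self.model_type (IMAGE_GENERATION) set in the constructor.
    response = dalle_client.call(api_kwargs=api_kwargs)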

adalflow/tests/test_openai_client.py

Lines changed: 17 additions & 20 deletions
@@ -18,7 +18,12 @@ def getenv_side_effect(key):
 
 class TestOpenAIClient(unittest.IsolatedAsyncioTestCase):
     def setUp(self):
-        self.client = OpenAIClient(api_key="fake_api_key")
+        # Default client for LLM tests
+        self.client = OpenAIClient(api_key="fake_api_key", model_type=ModelType.LLM)
+
+        # Client for image generation tests
+        self.image_client = OpenAIClient(api_key="fake_api_key", model_type=ModelType.IMAGE_GENERATION)
+
         self.mock_response = {
             "id": "cmpl-3Q8Z5J9Z1Z5z5",
             "created": 1635820005,
@@ -152,7 +157,6 @@ def test_convert_inputs_to_api_kwargs_with_images(self):
         result = self.client.convert_inputs_to_api_kwargs(
             input="Describe this image",
             model_kwargs=model_kwargs,
-            model_type=ModelType.LLM,
         )
         expected_content = [
             {"type": "text", "text": "Describe this image"},
@@ -175,7 +179,6 @@ def test_convert_inputs_to_api_kwargs_with_images(self):
         result = self.client.convert_inputs_to_api_kwargs(
             input="Compare these images",
             model_kwargs=model_kwargs,
-            model_type=ModelType.LLM,
         )
         expected_content = [
             {"type": "text", "text": "Compare these images"},
@@ -202,15 +205,13 @@ async def test_acall_llm(self, MockAsyncOpenAI):
         MockAsyncOpenAI.return_value = mock_async_client
 
         # Mock the response
-
         mock_async_client.chat.completions.create = AsyncMock(
             return_value=self.mock_response
         )
 
         # Call the _acall method
-
         result = await self.client.acall(
-            api_kwargs=self.api_kwargs, model_type=ModelType.LLM
+            api_kwargs=self.api_kwargs,
         )
 
         # Assertions
@@ -236,7 +237,7 @@ def test_call(self, MockSyncOpenAI, mock_init_sync_client):
         self.client.sync_client = mock_sync_client
 
         # Call the call method
-        result = self.client.call(api_kwargs=self.api_kwargs, model_type=ModelType.LLM)
+        result = self.client.call(api_kwargs=self.api_kwargs)
 
         # Assertions
         mock_sync_client.chat.completions.create.assert_called_once_with(
@@ -264,7 +265,7 @@ async def test_acall_llm_with_vision(self, MockAsyncOpenAI):
 
         # Call the _acall method with vision model
         result = await self.client.acall(
-            api_kwargs=self.vision_api_kwargs, model_type=ModelType.LLM
+            api_kwargs=self.vision_api_kwargs,
         )
 
         # Assertions
@@ -293,7 +294,7 @@ def test_call_with_vision(self, MockSyncOpenAI, mock_init_sync_client):
 
         # Call the call method with vision model
         result = self.client.call(
-            api_kwargs=self.vision_api_kwargs, model_type=ModelType.LLM
+            api_kwargs=self.vision_api_kwargs,
         )
 
         # Assertions
@@ -314,10 +315,9 @@ def test_call_with_vision(self, MockSyncOpenAI, mock_init_sync_client):
 
     def test_convert_inputs_to_api_kwargs_for_image_generation(self):
         # Test basic image generation
-        result = self.client.convert_inputs_to_api_kwargs(
+        result = self.image_client.convert_inputs_to_api_kwargs(
             input="a white siamese cat",
             model_kwargs={"model": "dall-e-3"},
-            model_type=ModelType.IMAGE_GENERATION,
         )
         self.assertEqual(result["prompt"], "a white siamese cat")
         self.assertEqual(result["model"], "dall-e-3")
@@ -335,14 +335,13 @@ def test_convert_inputs_to_api_kwargs_for_image_generation(self):
         with open(test_mask, "wb") as f:
             f.write(b"fake mask content")
 
-        result = self.client.convert_inputs_to_api_kwargs(
+        result = self.image_client.convert_inputs_to_api_kwargs(
             input="a white siamese cat",
             model_kwargs={
                 "model": "dall-e-2",
                 "image": test_image,
                 "mask": test_mask,
             },
-            model_type=ModelType.IMAGE_GENERATION,
         )
         self.assertEqual(result["prompt"], "a white siamese cat")
         self.assertEqual(result["model"], "dall-e-2")
@@ -366,9 +365,8 @@ async def test_acall_image_generation(self, MockAsyncOpenAI):
         )
 
         # Call the acall method with image generation
-        result = await self.client.acall(
+        result = await self.image_client.acall(
             api_kwargs=self.image_generation_kwargs,
-            model_type=ModelType.IMAGE_GENERATION,
         )
 
         # Assertions
@@ -379,7 +377,7 @@ async def test_acall_image_generation(self, MockAsyncOpenAI):
         self.assertEqual(result, self.mock_image_response)
 
         # Test parse_image_generation_response
-        output = self.client.parse_image_generation_response(result)
+        output = self.image_client.parse_image_generation_response(result)
         self.assertTrue(isinstance(output, GeneratorOutput))
         self.assertEqual(output.data, "https://example.com/generated_image.jpg")
 
@@ -398,12 +396,11 @@ def test_call_image_generation(self, MockSyncOpenAI, mock_init_sync_client):
         )
 
         # Set the sync client
-        self.client.sync_client = mock_sync_client
+        self.image_client.sync_client = mock_sync_client
 
         # Call the call method with image generation
-        result = self.client.call(
+        result = self.image_client.call(
             api_kwargs=self.image_generation_kwargs,
-            model_type=ModelType.IMAGE_GENERATION,
         )
 
         # Assertions
@@ -413,7 +410,7 @@ def test_call_image_generation(self, MockSyncOpenAI, mock_init_sync_client):
         self.assertEqual(result, self.mock_image_response)
 
         # Test parse_image_generation_response
-        output = self.client.parse_image_generation_response(result)
+        output = self.image_client.parse_image_generation_response(result)
         self.assertTrue(isinstance(output, GeneratorOutput))
         self.assertEqual(output.data, "https://example.com/generated_image.jpg")

Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
1+
Multimodal Client Tutorial
2+
=======================
3+
4+
This tutorial demonstrates how to use the OpenAI client for different types of tasks: text generation, vision analysis, and image generation.
5+
6+
Basic Setup
7+
----------
8+
9+
First, make sure you have your OpenAI API key set in your environment:
10+
11+
.. code-block:: bash
12+
13+
export OPENAI_API_KEY='your_api_key_here'
14+
15+
The OpenAI client supports three different model types:
16+
17+
- ``ModelType.LLM`` - For text generation and vision tasks (default)
18+
- ``ModelType.IMAGE_GENERATION`` - For DALL-E image generation
19+
- ``ModelType.EMBEDDER`` - For text embeddings
20+
21+
Note that most recent OpenAI models (like GPT-4) support both text and vision tasks by default, so you can use the same client for both.
22+
23+
Text and Vision Tasks
24+
------------------
25+
26+
For text generation and vision tasks, you can use the default ``ModelType.LLM`` with any OpenAI multimodal model:
27+
28+
.. code-block:: python
29+
30+
from adalflow.core import Generator
31+
from adalflow.components.model_client import OpenAIClient
32+
from adalflow.core.types import ModelType
33+
34+
# Default model_type is LLM
35+
client = OpenAIClient()
36+
generator = Generator(
37+
model_client=client,
38+
model_kwargs={"model": "gpt-4", "max_tokens": 100}
39+
)
40+
41+
# Text generation
42+
text_response = generator({"input_str": "Hello, world!"})
43+
print(f"Text Response: {text_response.raw_response}")
44+
45+
# Vision analysis with the same client
46+
vision_response = generator(
47+
prompt_kwargs={"input_str": "What do you see in this image?"},
48+
model_kwargs={
49+
"model": "gpt-4", # Same model can handle both text and images
50+
"images": "https://example.com/image.jpg",
51+
"max_tokens": 300,
52+
},
53+
)
54+
print(f"Vision Response: {vision_response.raw_response}")
55+
56+
Image Generation
57+
--------------
58+
59+
For DALL-E image generation, explicitly set ``ModelType.IMAGE_GENERATION``:
60+
61+
.. code-block:: python
62+
63+
dalle_client = OpenAIClient(model_type=ModelType.IMAGE_GENERATION)
64+
dalle_gen = Generator(
65+
model_client=dalle_client,
66+
model_kwargs={
67+
"model": "dall-e-3",
68+
"size": "1024x1024",
69+
"quality": "standard",
70+
"n": 1,
71+
},
72+
)
73+
74+
# For image generation, input_str becomes the prompt
75+
response = dalle_gen(
76+
{"input_str": "A happy siamese cat playing with a red ball of yarn"}
77+
)
78+
print(f"Generated Image URL: {response.data}")
79+
80+
Error Handling
81+
------------
82+
83+
The client includes built-in error handling for common issues:
84+
85+
1. Invalid Image URLs:
86+
87+
.. code-block:: python
88+
89+
# The client will properly handle invalid image URLs
90+
gen = Generator(
91+
model_client=client,
92+
model_kwargs={
93+
"model": "gpt-4",
94+
"images": "https://invalid.url/nonexistent.jpg",
95+
},
96+
)
97+
response = gen({"input_str": "What do you see?"})
98+
# Will return GeneratorOutput with error information
99+
100+
2. Invalid Parameters:
101+
102+
.. code-block:: python
103+
104+
# The client will catch invalid parameters
105+
gen = Generator(
106+
model_client=dalle_client,
107+
model_kwargs={
108+
"model": "dall-e-3",
109+
"size": "invalid_size", # Invalid size
110+
},
111+
)
112+
response = gen({"input_str": "A cat"})
113+
# Will return GeneratorOutput with error information
114+
115+
Response Structure
116+
---------------
117+
118+
All responses are returned as ``GeneratorOutput`` objects with these fields:
119+
120+
- ``data``: The processed output (e.g., generated text, image URL)
121+
- ``raw_response``: The raw response string
122+
- ``error``: Any error messages (None if successful)
123+
- ``usage``: Token usage information for text generation
124+
- ``metadata``: Additional metadata (if any)
125+
126+
For example:
127+
128+
.. code-block:: python
129+
130+
# Text/Vision response
131+
GeneratorOutput(
132+
data='Hello! How can I assist you today?',
133+
error=None,
134+
usage=CompletionUsage(completion_tokens=10, prompt_tokens=45, total_tokens=55),
135+
raw_response='Hello! How can I assist you today?'
136+
)
137+
138+
# Image generation response
139+
GeneratorOutput(
140+
data='https://...image-url...',
141+
error=None,
142+
raw_response='[Image(url="https://...")]'
143+
)
144+
145+
Best Practices
146+
------------
147+
148+
1. **Model Type Selection**:
149+
- Use the default ``ModelType.LLM`` for text and vision tasks (they use the same models)
150+
- Explicitly set ``ModelType.IMAGE_GENERATION`` only for DALL-E
151+
- Use ``ModelType.EMBEDDER`` for embedding generation
152+
153+
2. **Error Handling**:
154+
- Always check the ``error`` field in the response
155+
- Handle both API errors and invalid parameter errors
156+
157+
3. **Resource Management**:
158+
- Monitor token usage through the ``usage`` field
159+
- Be mindful of image sizes and quality settings for DALL-E
160+
161+
4. **Model Selection**:
162+
- Most recent OpenAI models support both text and vision:
163+
- ``gpt-4`` for both text and vision tasks
164+
- ``dall-e-3`` or ``dall-e-2`` for image generation
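
The tutorial mentions ``ModelType.EMBEDDER`` but does not show it. A minimal sketch, not part of this commit, under the assumption that AdalFlow's ``Embedder`` component accepts ``model_client``/``model_kwargs`` the same way ``Generator`` does (the embedding model name is a placeholder):

    from adalflow.core import Embedder
    from adalflow.components.model_client import OpenAIClient
    from adalflow.core.types import ModelType

    # Assumed wiring: the client is constructed with ModelType.EMBEDDER,
    # so its call() routes to the embeddings endpoint by default.
    embedder = Embedder(
        model_client=OpenAIClient(model_type=ModelType.EMBEDDER),
        model_kwargs={"model": "text-embedding-3-small"},  # placeholder model name
    )

    output = embedder("a white siamese cat")
    print(output.data)  # embedding vectors, if the assumed API holds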
