log = logging.getLogger(__name__)
T = TypeVar("T")

-# Models that support multimodal inputs
-MULTIMODAL_MODELS = {
-    "gpt-4o",  # Versatile, high-intelligence flagship model
-    "gpt-4o-mini",  # Fast, affordable small model for focused tasks
-    "o1",  # Reasoning model that excels at complex, multi-step tasks
-    "o1-mini",  # Smaller reasoning model for complex tasks
-}
-

# completion parsing functions; you can combine them into one single chat completion parser
def get_first_message_content(completion: ChatCompletion) -> str:
@@ -108,7 +100,7 @@ def get_probabilities(completion: ChatCompletion) -> List[List[TokenLogProb]]:
class OpenAIClient(ModelClient):
    __doc__ = r"""A component wrapper for the OpenAI API client.

-    Support both embedding and chat completion API.
+    Supports both the embedding and chat completion APIs, including multimodal capabilities.

    Users can (1) simply use the ``Embedder`` and ``Generator`` components by passing OpenAIClient() as the model_client,
    or (2) use this as an example to create their own API client or extend this class (copying and modifying the code) in their own project.
@@ -119,6 +111,9 @@ class OpenAIClient(ModelClient):
    Instead
    - use :ref:`OutputParser<components-output_parsers>` for response parsing and formatting.

+    For multimodal inputs, provide images in model_kwargs["images"] as a file path, a URL, or a list of them.
+    The model must support vision capabilities (e.g., gpt-4o, gpt-4o-mini, o1, o1-mini).
+
    Args:
        api_key (Optional[str], optional): OpenAI API key. Defaults to None.
        chat_completion_parser (Callable[[Completion], Any], optional): A function to parse the chat completion to a str. Defaults to None.
@@ -127,6 +122,7 @@ class OpenAIClient(ModelClient):
    References:
        - Embeddings models: https://platform.openai.com/docs/guides/embeddings
        - Chat models: https://platform.openai.com/docs/guides/text-generation
+        - Vision models: https://platform.openai.com/docs/guides/vision
        - OpenAI docs: https://platform.openai.com/docs/introduction
    """

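A minimal usage sketch of the multimodal path described in the docstring above, modeled on the commented example at the end of this file (the image URL is a placeholder, and any vision-capable model works):

    from adalflow.core import Generator
    from adalflow.utils import setup_env

    setup_env()  # assumed to load OPENAI_API_KEY, as in the example below
    gen = Generator(
        model_client=OpenAIClient(),
        model_kwargs={
            "model": "gpt-4o-mini",  # must support vision
            "images": "https://example.com/photo.jpg",  # placeholder; a local path also works
            "detail": "auto",
        },
    )
    print(gen({"input_str": "Describe this image."}))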
@@ -209,7 +205,7 @@ def track_completion_usage(
    def parse_embedding_response(
        self, response: CreateEmbeddingResponse
    ) -> EmbedderOutput:
-        r"""Parse the embedding response to a structure LightRAG components can understand.
+        r"""Parse the embedding response to a structure Adalflow components can understand.

        Should be called in ``Embedder``.
        """
@@ -227,7 +223,20 @@ def convert_inputs_to_api_kwargs(
    ) -> Dict:
        r"""
        Specify the API input type and output api_kwargs that will be used in _call and _acall methods.
-        Convert the Component's standard input, and system_input(chat model) and model_kwargs into API-specific format
+        Convert the Component's standard input and system_input (chat model), together with model_kwargs, into an API-specific format.
+        For multimodal inputs, images can be provided in model_kwargs["images"] as a string path, a URL, or a list of them.
+        The model specified in model_kwargs["model"] must support multimodal capabilities when images are provided.
+
+        Args:
+            input: The input text or messages to process.
+            model_kwargs: Additional parameters, including:
+                - images: Optional image source(s) as a path, URL, or list of them.
+                - detail: Image detail level ('auto', 'low', or 'high'); defaults to 'auto'.
+                - model: The model to use (must support multimodal inputs if images are provided).
+            model_type: The type of model (EMBEDDER or LLM).
+
+        Returns:
+            Dict: API-specific kwargs for the model call.
        """

        final_model_kwargs = model_kwargs.copy()
@@ -241,6 +250,8 @@ def convert_inputs_to_api_kwargs(
        elif model_type == ModelType.LLM:
            # convert input to messages
            messages: List[Dict[str, str]] = []
+            images = final_model_kwargs.pop("images", None)
+            detail = final_model_kwargs.pop("detail", "auto")

            if self._input_type == "messages":
                system_start_tag = "<START_OF_SYSTEM_PROMPT>"
@@ -257,14 +268,29 @@ def convert_inputs_to_api_kwargs(
                if match:
                    system_prompt = match.group(1)
                    input_str = match.group(2)
-
                else:
                    print("No match found.")
                if system_prompt and input_str:
                    messages.append({"role": "system", "content": system_prompt})
-                    messages.append({"role": "user", "content": input_str})
+                    if images:
+                        content = [{"type": "text", "text": input_str}]
+                        if isinstance(images, (str, dict)):
+                            images = [images]
+                        for img in images:
+                            content.append(self._prepare_image_content(img, detail))
+                        messages.append({"role": "user", "content": content})
+                    else:
+                        messages.append({"role": "user", "content": input_str})
            if len(messages) == 0:
-                messages.append({"role": "system", "content": input})
+                if images:
+                    content = [{"type": "text", "text": input}]
+                    if isinstance(images, (str, dict)):
+                        images = [images]
+                    for img in images:
+                        content.append(self._prepare_image_content(img, detail))
+                    messages.append({"role": "user", "content": content})
+                else:
+                    messages.append({"role": "system", "content": input})
            final_model_kwargs["messages"] = messages
        else:
            raise ValueError(f"model_type {model_type} is not supported")
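For illustration, a sketch of the api_kwargs the new code path produces for a multimodal LLM call; the exact image_url dict comes from _prepare_image_content, so the commented output below is an approximation:

    client = OpenAIClient()
    api_kwargs = client.convert_inputs_to_api_kwargs(
        input="What is in this image?",
        model_kwargs={"model": "gpt-4o-mini", "images": "https://example.com/photo.jpg"},
        model_type=ModelType.LLM,
    )
    # api_kwargs["messages"] is then roughly:
    # [{"role": "user",
    #   "content": [
    #       {"type": "text", "text": "What is in this image?"},
    #       {"type": "image_url",
    #        "image_url": {"url": "https://example.com/photo.jpg", "detail": "auto"}},
    #   ]}]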
@@ -349,9 +375,19 @@ def _encode_image(self, image_path: str) -> str:

        Returns:
            Base64 encoded image string.
+
+        Raises:
+            ValueError: If the file cannot be read or doesn't exist.
        """
-        with open(image_path, "rb") as image_file:
-            return base64.b64encode(image_file.read()).decode("utf-8")
+        try:
+            with open(image_path, "rb") as image_file:
+                return base64.b64encode(image_file.read()).decode("utf-8")
+        except FileNotFoundError:
+            raise ValueError(f"Image file not found: {image_path}")
+        except PermissionError:
+            raise ValueError(f"Permission denied when reading image file: {image_path}")
+        except Exception as e:
+            raise ValueError(f"Error encoding image {image_path}: {str(e)}")

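For context, the base64 string returned here is presumably wrapped into a data URL by _prepare_image_content before being sent to the API; a sketch under that assumption (the JPEG MIME type is a simplification, and "photo.jpg" is a hypothetical file):

    b64 = client._encode_image("photo.jpg")  # hypothetical local file
    image_block = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "auto"},
    }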
    def _prepare_image_content(
        self, image_source: Union[str, Dict[str, Any]], detail: str = "auto"
@@ -382,77 +418,23 @@ def _prepare_image_content(
            }
        return image_source

-    def generate(
-        self,
-        prompt: str,
-        images: Optional[
-            Union[str, List[str], Dict[str, Any], List[Dict[str, Any]]]
-        ] = None,
-        model_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> GeneratorOutput:
-        """Generate text response for given prompt and optionally images.
-
-        Args:
-            prompt: Text prompt.
-            images: Optional image source(s) - can be path(s), URL(s), or formatted dict(s).
-            model_kwargs: Additional model parameters.
-
-        Returns:
-            GeneratorOutput containing the model's response.
-        """
-        model_kwargs = model_kwargs or {}
-        model = model_kwargs.get("model", "gpt-4o-mini")
-        max_tokens = model_kwargs.get("max_tokens", 300)
-        detail = model_kwargs.get("detail", "auto")
-
-        # Check if model supports multimodal inputs when images are provided
-        if images and model not in MULTIMODAL_MODELS:
-            return GeneratorOutput(
-                error=f"Model {model} does not support multimodal inputs. Supported models: {MULTIMODAL_MODELS}"
-            )
-
-        # Prepare message content
-        if images:
-            content = [{"type": "text", "text": prompt}]
-            if not isinstance(images, list):
-                images = [images]
-            for img in images:
-                content.append(self._prepare_image_content(img, detail))
-            messages = [{"role": "user", "content": content}]
-        else:
-            messages = [{"role": "user", "content": prompt}]
-
-        try:
-            response = self.client.chat.completions.create(
-                model=model,
-                messages=messages,
-                max_tokens=max_tokens,
-            )
-            return GeneratorOutput(
-                id=response.id,
-                data=response.choices[0].message.content,
-                usage=response.usage.model_dump() if response.usage else None,
-                raw_response=response.model_dump(),
-            )
-        except Exception as e:
-            return GeneratorOutput(error=str(e))
-

+# Example usage:
# if __name__ == "__main__":
#     from adalflow.core import Generator
#     from adalflow.utils import setup_env, get_logger
-
+#
#     log = get_logger(level="DEBUG")
-
+#
#     setup_env()
#     prompt_kwargs = {"input_str": "What is the meaning of life?"}
-
+#
#     gen = Generator(
#         model_client=OpenAIClient(),
#         model_kwargs={"model": "gpt-3.5-turbo", "stream": True},
#     )
#     gen_response = gen(prompt_kwargs)
#     print(f"gen_response: {gen_response}")
-
+#
#     for genout in gen_response.data:
#         print(f"genout: {genout}")
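A multimodal variant of the commented example above might look like the following sketch (the image sources are placeholders, and the model must be vision-capable):

    multimodal_gen = Generator(
        model_client=OpenAIClient(),
        model_kwargs={
            "model": "gpt-4o",
            "images": ["photo1.jpg", "https://example.com/photo2.jpg"],  # placeholder sources
            "detail": "high",
        },
    )
    multimodal_response = multimodal_gen({"input_str": "Compare these two images."})
    print(f"multimodal_response: {multimodal_response}")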