Commit 5144fc4

Update .rst file and colab
1 parent ff1060a commit 5144fc4

File tree

2 files changed: +332 -0 lines changed

docs/source/tutorials/model_client.rst

Lines changed: 66 additions & 0 deletions
@@ -1552,6 +1552,71 @@ The OpenAI client also supports multimodal inputs. Here's a quick example:
The client handles both local files and URLs, with support for PNG, JPEG, WEBP, and non-animated GIF formats.

OpenAI Image Generation
-----------------------

The OpenAI client supports image generation, editing, and variation creation through DALL-E models. First, define a ``Generator`` subclass with the correct model type:

.. code-block:: python

    from adalflow import Generator
    from adalflow.core.types import ModelType

    class ImageGenerator(Generator):
        """Generator subclass for image generation."""
        model_type = ModelType.IMAGE_GENERATION

Then you can use it like this:

.. code-block:: python

    from adalflow import OpenAIClient

    generator = ImageGenerator(
        model_client=OpenAIClient(),
        model_kwargs={
            "model": "dall-e-3",    # or "dall-e-2"
            "size": "1024x1024",    # "1024x1024", "1024x1792", or "1792x1024" for DALL-E 3
            "quality": "standard",  # "standard" or "hd" (DALL-E 3 only)
            "n": 1                  # number of images (1 for DALL-E 3, 1-10 for DALL-E 2)
        }
    )

    # Generate an image from text
    response = generator(
        prompt_kwargs={"input_str": "A white siamese cat in a space suit"}
    )
    # response.data will contain the image URL

    # Edit an existing image
    response = generator(
        prompt_kwargs={"input_str": "Add a red hat"},
        model_kwargs={
            "model": "dall-e-2",
            "image": "path/to/cat.png",  # original image
            "mask": "path/to/mask.png"   # optional mask showing where to edit
        }
    )

    # Create variations of an image
    response = generator(
        prompt_kwargs={"input_str": None},  # not needed for variations
        model_kwargs={
            "model": "dall-e-2",
            "image": "path/to/cat.png"  # image to create variations of
        }
    )

The client supports:

- Image generation from text descriptions using DALL-E 3 or DALL-E 2
- Image editing with optional masking (DALL-E 2)
- Creating variations of existing images (DALL-E 2)
- Both local file paths and base64-encoded images
- Various image sizes and quality settings
- Multiple output formats (URL or base64), as sketched below
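
For instance, here is a minimal sketch of requesting base64 output instead of a URL. It assumes the OpenAI ``response_format`` parameter (``"url"`` or ``"b64_json"``) is forwarded through ``model_kwargs``; that pass-through is an assumption for illustration, not something this tutorial guarantees.

.. code-block:: python

    import base64

    # "response_format" is the OpenAI images API parameter; whether the
    # client forwards it unchanged is assumed here for illustration.
    response = generator(
        prompt_kwargs={"input_str": "A white siamese cat in a space suit"},
        model_kwargs={"model": "dall-e-3", "response_format": "b64_json"}
    )

    # response.data would then hold a base64 string rather than a URL
    with open("cat.png", "wb") as f:
        f.write(base64.b64decode(response.data))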

The response is always wrapped in a ``GeneratorOutput`` object, maintaining consistency with other AdalFlow operations. The generated image(s) will be available in the ``data`` field as either a URL or a base64 string.
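
As a quick illustration, the sketch below consumes that output when ``data`` holds a URL. The ``error`` check mirrors ordinary ``GeneratorOutput`` handling; the download itself uses plain ``requests``, which is an external dependency and not part of the client.

.. code-block:: python

    import requests

    response = generator(
        prompt_kwargs={"input_str": "A white siamese cat in a space suit"}
    )
    if response.error:
        print(f"Generation failed: {response.error}")
    else:
        # With the default output format, response.data holds the image URL
        image = requests.get(response.data, timeout=30)
        with open("generated.png", "wb") as f:
            f.write(image.content)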

.. admonition:: API reference
   :class: highlight
@@ -1563,3 +1628,4 @@ The client handles both local files and URLs, with support for PNG, JPEG, WEBP,

   - :class:`components.model_client.anthropic_client.AnthropicAPIClient`
   - :class:`components.model_client.google_client.GoogleGenAIClient`
   - :class:`components.model_client.cohere_client.CohereAPIClient`

notebooks/tutorials/adalflow_modelclient.ipynb

Lines changed: 266 additions & 0 deletions
@@ -2043,6 +2043,272 @@
    "build_custom_model_client()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AdalFlow multimodal model client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from adalflow.core import Generator\n",
    "from adalflow.components.model_client.openai_client import OpenAIClient\n",
    "from adalflow.core.types import ModelType\n",
    "\n",
    "class ImageGenerator(Generator):\n",
    "    \"\"\"Generator subclass for image generation (ModelType.IMAGE_GENERATION).\"\"\"\n",
    "    model_type = ModelType.IMAGE_GENERATION\n",
    "\n",
    "def analyze_single_image():\n",
    "    \"\"\"Example of analyzing a single image with GPT-4o vision\"\"\"\n",
    "    client = OpenAIClient()\n",
    "\n",
    "    gen = Generator(\n",
    "        model_client=client,\n",
    "        model_kwargs={\n",
    "            \"model\": \"gpt-4o-mini\",\n",
    "            \"images\": \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/images/happy_cat.jpg\",\n",
    "            \"max_tokens\": 300\n",
    "        }\n",
    "    )\n",
    "\n",
    "    response = gen({\"input_str\": \"What do you see in this image? Be detailed but concise.\"})\n",
    "    print(\"\\n=== Single Image Analysis ===\")\n",
    "    print(f\"Description: {response.raw_response}\")\n",
    "\n",
    "def analyze_multiple_images():\n",
    "    \"\"\"Example of analyzing multiple images in one prompt\"\"\"\n",
    "    client = OpenAIClient()\n",
    "\n",
    "    # List of images to analyze together\n",
    "    images = [\n",
    "        \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/images/happy_cat.jpg\",\n",
    "        \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/images/sad_cat.jpg\"\n",
    "    ]\n",
    "\n",
    "    gen = Generator(\n",
    "        model_client=client,\n",
    "        model_kwargs={\n",
    "            \"model\": \"gpt-4o-mini\",\n",
    "            \"images\": images,\n",
    "            \"max_tokens\": 300\n",
    "        }\n",
    "    )\n",
    "\n",
    "    response = gen({\"input_str\": \"Compare and contrast these two images. What are the main differences?\"})\n",
    "    print(\"\\n=== Multiple Images Analysis ===\")\n",
    "    print(f\"Comparison: {response.raw_response}\")\n",
    "\n",
    "def generate_art_with_dalle():\n",
    "    \"\"\"Example of generating art using DALL-E 3\"\"\"\n",
    "    client = OpenAIClient()\n",
    "\n",
    "    # Image generation requires the IMAGE_GENERATION model type\n",
    "    gen = ImageGenerator(\n",
    "        model_client=client,\n",
    "        model_kwargs={\n",
    "            \"model\": \"dall-e-3\",\n",
    "            \"size\": \"1024x1024\",\n",
    "            \"quality\": \"standard\",\n",
    "            \"n\": 1\n",
    "        }\n",
    "    )\n",
    "\n",
    "    response = gen({\n",
    "        \"input_str\": \"A serene Japanese garden with a small bridge over a koi pond, cherry blossoms falling gently in the breeze\"\n",
    "    })\n",
    "    print(\"\\n=== Art Generation with DALL-E 3 ===\")\n",
    "    print(f\"Generated Image URL: {response.data}\")\n",
    "\n",
    "def create_image_variations(image_path=\"path/to/your/image.jpg\"):\n",
    "    \"\"\"Example of creating variations of an existing image\"\"\"\n",
    "    client = OpenAIClient()\n",
    "\n",
    "    gen = ImageGenerator(\n",
    "        model_client=client,\n",
    "        model_kwargs={\n",
    "            \"model\": \"dall-e-2\",\n",
    "            \"image\": image_path,\n",
    "            \"n\": 2,  # Generate 2 variations\n",
    "            \"size\": \"1024x1024\"\n",
    "        }\n",
    "    )\n",
    "\n",
    "    response = gen({\"input_str\": \"\"})\n",
    "    print(\"\\n=== Image Variations ===\")\n",
    "    print(f\"Variation URLs: {response.data}\")\n",
    "\n",
    "def edit_image_with_mask(image_path=\"path/to/image.jpg\", mask_path=\"path/to/mask.jpg\"):\n",
    "    \"\"\"Example of editing specific parts of an image using a mask\"\"\"\n",
    "    client = OpenAIClient()\n",
    "\n",
    "    gen = ImageGenerator(\n",
    "        model_client=client,\n",
    "        model_kwargs={\n",
    "            \"model\": \"dall-e-2\",\n",
    "            \"image\": image_path,\n",
    "            \"mask\": mask_path,\n",
    "            \"n\": 1,\n",
    "            \"size\": \"1024x1024\"\n",
    "        }\n",
    "    )\n",
    "\n",
    "    response = gen({\n",
    "        \"input_str\": \"Replace the masked area with a beautiful sunset\"\n",
    "    })\n",
    "    print(\"\\n=== Image Editing ===\")\n",
    "    print(f\"Edited Image URL: {response.data}\")\n",
    "\n",
    "def mixed_image_text_conversation():\n",
    "    \"\"\"Example of having a conversation that includes both images and text\"\"\"\n",
    "    client = OpenAIClient()\n",
    "\n",
    "    gen = Generator(\n",
    "        model_client=client,\n",
    "        model_kwargs={\n",
    "            \"model\": \"gpt-4o-mini\",\n",
    "            \"images\": [\n",
    "                \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/images/happy_cat.jpg\",\n",
    "                \"path/to/local/image.jpg\"  # Replace with your local image path\n",
    "            ],\n",
    "            \"max_tokens\": 300\n",
    "        }\n",
    "    )\n",
    "\n",
    "    conversation = \"\"\"<START_OF_SYSTEM_PROMPT>You are a helpful assistant skilled in analyzing images and providing detailed descriptions.</END_OF_SYSTEM_PROMPT>\n",
    "    <START_OF_USER_PROMPT>I'm showing you two images. Please analyze them and tell me what emotions they convey.</END_OF_USER_PROMPT>\"\"\"\n",
    "\n",
    "    response = gen({\"input_str\": conversation})\n",
    "    print(\"\\n=== Mixed Image-Text Conversation ===\")\n",
    "    print(f\"Assistant's Analysis: {response.raw_response}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    print(\"OpenAI Image Processing Examples\\n\")\n",
    "\n",
    "    # Basic image analysis\n",
    "    analyze_single_image()\n",
    "\n",
    "    # Multiple image analysis\n",
    "    analyze_multiple_images()\n",
    "\n",
    "    # Image generation\n",
    "    generate_art_with_dalle()\n",
    "\n",
    "    # create_image_variations(<path_to_image>)\n",
    "    # edit_image_with_mask(<path_to_image>, <path_to_mask>)\n",
    "    # mixed_image_text_conversation()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Image generation with DALL-E and image understanding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from adalflow.core import Generator\n",
    "from adalflow.components.model_client.openai_client import OpenAIClient\n",
    "from adalflow.core.types import ModelType"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ImageGenerator(Generator):\n",
    "    \"\"\"Generator subclass for image generation.\"\"\"\n",
    "    model_type = ModelType.IMAGE_GENERATION\n",
    "\n",
    "def test_vision_and_generation():\n",
    "    \"\"\"Test both vision analysis and image generation\"\"\"\n",
    "    client = OpenAIClient()\n",
    "\n",
    "    # 1. Test vision analysis\n",
    "    vision_gen = Generator(\n",
    "        model_client=client,\n",
    "        model_kwargs={\n",
    "            \"model\": \"gpt-4o-mini\",\n",
    "            \"images\": \"https://upload.wikimedia.org/wikipedia/en/7/7d/Lenna_%28test_image%29.png\",\n",
    "            \"max_tokens\": 300\n",
    "        }\n",
    "    )\n",
    "\n",
    "    vision_response = vision_gen({\"input_str\": \"What do you see in this image? Be detailed but concise.\"})\n",
    "    print(\"\\n=== Vision Analysis ===\")\n",
    "    print(f\"Description: {vision_response.raw_response}\")\n",
    "\n",
    "    # 2. Test DALL-E image generation\n",
    "    dalle_gen = ImageGenerator(\n",
    "        model_client=client,\n",
    "        model_kwargs={\n",
    "            \"model\": \"dall-e-3\",\n",
    "            \"size\": \"1024x1024\",\n",
    "            \"quality\": \"standard\",\n",
    "            \"n\": 1\n",
    "        }\n",
    "    )\n",
    "\n",
    "    # For image generation, input_str becomes the prompt\n",
    "    response = dalle_gen({\"input_str\": \"A happy siamese cat playing with a red ball of yarn\"})\n",
    "    print(\"\\n=== DALL-E Generation ===\")\n",
    "    print(f\"Generated Image URL: {response.data}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Invalid image URL - Generator output still works!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def test_invalid_image_url():\n",
    "    \"\"\"Test Generator output with an invalid image URL\"\"\"\n",
    "    client = OpenAIClient()\n",
    "    gen = Generator(\n",
    "        model_client=client,\n",
    "        model_kwargs={\n",
    "            \"model\": \"gpt-4o-mini\",\n",
    "            \"images\": \"https://invalid.url/nonexistent.jpg\",\n",
    "            \"max_tokens\": 300\n",
    "        }\n",
    "    )\n",
    "\n",
    "    print(\"\\n=== Testing Invalid Image URL ===\")\n",
    "    response = gen({\"input_str\": \"What do you see in this image?\"})\n",
    "    print(f\"Response with invalid image URL: {response}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if __name__ == \"__main__\":\n",
    "    print(\"Starting OpenAI Vision and DALL-E test...\\n\")\n",
    "    test_invalid_image_url()\n",
    "    test_vision_and_generation()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
