Proposal: support image generation without a dedicated Generator subclass

fm1320 · fm1320 · commit fd8097496ab1 · 2025-01-16T23:54:00.000Z
diff --git a/adalflow/adalflow/components/model_client/openai_client.py b/adalflow/adalflow/components/model_client/openai_client.py
@@ -243,7 +243,18 @@ def convert_inputs_to_api_kwargs(
                 - images: Optional image source(s) as path, URL, or list of them
                 - detail: Image detail level ('auto', 'low', or 'high'), defaults to 'auto'
                 - model: The model to use (must support multimodal inputs if images are provided)
-            model_type: The type of model (EMBEDDER or LLM)
+                For image generation:
+                - model: "dall-e-3" or "dall-e-2"
+                - size: "1024x1024", "1024x1792", or "1792x1024" for DALL-E 3; "256x256", "512x512", or "1024x1024" for DALL-E 2
+                - quality: "standard" or "hd" (DALL-E 3 only)
+                - n: Number of images (1 for DALL-E 3, 1-10 for DALL-E 2)
+                - response_format: "url" or "b64_json"
+                For image edits (DALL-E 2 only):
+                - image: Path to the input image
+                - mask: Path to the mask image
+                For variations (DALL-E 2 only):
+                - image: Path to the input image
+            model_type: The type of model (EMBEDDER, LLM, or IMAGE_GENERATION)
 
         Returns:
             Dict: API-specific kwargs for the model call
@@ -308,20 +319,44 @@ def convert_inputs_to_api_kwargs(
             # Ensure model is specified
             if "model" not in final_model_kwargs:
                 raise ValueError("model must be specified for image generation")
-            # Set defaults for DALL-E 3 if not specified
-            final_model_kwargs["size"] = final_model_kwargs.get("size", "1024x1024")
-            final_model_kwargs["quality"] = final_model_kwargs.get("quality", "standard")
-            final_model_kwargs["n"] = final_model_kwargs.get("n", 1)
-            final_model_kwargs["response_format"] = final_model_kwargs.get("response_format", "url")
-
-            # Handle image edits and variations
-            image = final_model_kwargs.get("image")
-            if isinstance(image, str) and os.path.isfile(image):
-                final_model_kwargs["image"] = self._encode_image(image)
             
-            mask = final_model_kwargs.get("mask")
-            if isinstance(mask, str) and os.path.isfile(mask):
-                final_model_kwargs["mask"] = self._encode_image(mask)
+            # Set defaults for image generation
+            if "operation" not in final_model_kwargs:
+                final_model_kwargs["operation"] = "generate"  # Default operation
+            
+            operation = final_model_kwargs.pop("operation")
+            
+            if operation == "generate":
+                # Set defaults for DALL-E 3 if not specified
+                final_model_kwargs["size"] = final_model_kwargs.get("size", "1024x1024")
+                final_model_kwargs["quality"] = final_model_kwargs.get("quality", "standard")
+                final_model_kwargs["n"] = final_model_kwargs.get("n", 1)
+                final_model_kwargs["response_format"] = final_model_kwargs.get("response_format", "url")
+            
+            elif operation in ["edit", "variation"]:
+                if "model" not in final_model_kwargs or final_model_kwargs["model"] != "dall-e-2":
+                    raise ValueError(f"{operation} operation is only available with DALL-E 2")
+                
+                # Handle image input
+                image_path = final_model_kwargs.get("image")
+                if not image_path or not os.path.isfile(image_path):
+                    raise ValueError(f"Valid image path must be provided for {operation}")
+                final_model_kwargs["image"] = open(image_path, "rb")
+                
+                # Handle mask for edit operation
+                if operation == "edit":
+                    mask_path = final_model_kwargs.get("mask")
+                    if not mask_path or not os.path.isfile(mask_path):
+                        raise ValueError("Valid mask path must be provided for edit operation")
+                    final_model_kwargs["mask"] = open(mask_path, "rb")
+                
+                # Set defaults
+                final_model_kwargs["size"] = final_model_kwargs.get("size", "1024x1024")
+                final_model_kwargs["n"] = final_model_kwargs.get("n", 1)
+                final_model_kwargs["response_format"] = final_model_kwargs.get("response_format", "url")
+            
+            else:
+                raise ValueError(f"Invalid operation: {operation}")
         else:
             raise ValueError(f"model_type {model_type} is not supported")
         return final_model_kwargs
@@ -371,18 +406,25 @@ def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINE
                 return self.sync_client.chat.completions.create(**api_kwargs)
             return self.sync_client.chat.completions.create(**api_kwargs)
         elif model_type == ModelType.IMAGE_GENERATION:
-            # Determine which image API to call based on the presence of image/mask
-            if "image" in api_kwargs:
-                if "mask" in api_kwargs:
-                    # Image edit
+            operation = api_kwargs.pop("operation", "generate")
+            
+            try:
+                if operation == "generate":
+                    response = self.sync_client.images.generate(**api_kwargs)
+                elif operation == "edit":
                     response = self.sync_client.images.edit(**api_kwargs)
-                else:
-                    # Image variation
+                elif operation == "variation":
                     response = self.sync_client.images.create_variation(**api_kwargs)
-            else:
-                # Image generation
-                response = self.sync_client.images.generate(**api_kwargs)
-            return response.data
+                else:
+                    raise ValueError(f"Invalid operation: {operation}")
+                
+                return response.data
+            finally:
+                # Clean up file handles if they exist
+                if "image" in api_kwargs and hasattr(api_kwargs["image"], "close"):
+                    api_kwargs["image"].close()
+                if "mask" in api_kwargs and hasattr(api_kwargs["mask"], "close"):
+                    api_kwargs["mask"].close()
         else:
             raise ValueError(f"model_type {model_type} is not supported")
 
@@ -410,18 +452,25 @@ async def acall(
         elif model_type == ModelType.LLM:
             return await self.async_client.chat.completions.create(**api_kwargs)
         elif model_type == ModelType.IMAGE_GENERATION:
-            # Determine which image API to call based on the presence of image/mask
-            if "image" in api_kwargs:
-                if "mask" in api_kwargs:
-                    # Image edit
+            operation = api_kwargs.pop("operation", "generate")
+            
+            try:
+                if operation == "generate":
+                    response = await self.async_client.images.generate(**api_kwargs)
+                elif operation == "edit":
                     response = await self.async_client.images.edit(**api_kwargs)
-                else:
-                    # Image variation
+                elif operation == "variation":
                     response = await self.async_client.images.create_variation(**api_kwargs)
-            else:
-                # Image generation
-                response = await self.async_client.images.generate(**api_kwargs)
-            return response.data
+                else:
+                    raise ValueError(f"Invalid operation: {operation}")
+                
+                return response.data
+            finally:
+                # Clean up file handles if they exist
+                if "image" in api_kwargs and hasattr(api_kwargs["image"], "close"):
+                    api_kwargs["image"].close()
+                if "mask" in api_kwargs and hasattr(api_kwargs["mask"], "close"):
+                    api_kwargs["mask"].close()
         else:
             raise ValueError(f"model_type {model_type} is not supported")
 
diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py
@@ -100,6 +100,8 @@ def __init__(
         # args for the cache
         cache_path: Optional[str] = None,
         use_cache: bool = False,
+        # args for model type
+        model_type: ModelType = ModelType.LLM,
     ) -> None:
         r"""The default prompt is set to the DEFAULT_ADALFLOW_SYSTEM_PROMPT. It has the following variables:
         - task_desc_str
@@ -110,6 +112,17 @@ def __init__(
         - steps_str
         You can preset the prompt kwargs to fill in the variables in the prompt using prompt_kwargs.
         But you can replace the prompt and set any variables you want and use the prompt_kwargs to fill in the variables.
+
+        Args:
+            model_client (ModelClient): The model client to use for the generator.
+            model_kwargs (Dict[str, Any], optional): The model kwargs to pass to the model client. Defaults to {}. Please refer to :ref:`ModelClient<components-model_client>` for the details on how to set the model_kwargs for your specific model if it is from our library.
+            template (Optional[str], optional): The template for the prompt.  Defaults to :ref:`DEFAULT_ADALFLOW_SYSTEM_PROMPT<core-default_prompt_template>`.
+            prompt_kwargs (Optional[Dict], optional): The preset prompt kwargs to fill in the variables in the prompt. Defaults to None.
+            output_processors (Optional[Component], optional):  The output processors after model call. It can be a single component or a chained component via ``Sequential``. Defaults to None.
+            name (Optional[str], optional): The name of the generator. Defaults to None.
+            cache_path (Optional[str], optional): The path to save the cache. Defaults to None.
+            use_cache (bool, optional): Whether to use cache. Defaults to False.
+            model_type (ModelType, optional): The type of model (EMBEDDER, LLM, or IMAGE_GENERATION). Defaults to ModelType.LLM.
         """
 
         if not isinstance(model_client, ModelClient):
@@ -133,6 +146,7 @@ def __init__(
         CallbackManager.__init__(self)
 
         self.name = name or self.__class__.__name__
+        self.model_type = model_type
 
         self._init_prompt(template, prompt_kwargs)
 
@@ -163,6 +177,7 @@ def __init__(
             "name": name,
             "cache_path": cache_path,
             "use_cache": use_cache,
+            "model_type": model_type,
         }
         self._teacher: Optional["Generator"] = None
         self._trace_api_kwargs: Dict[str, Any] = (
diff --git a/tutorials/multimodal_client_testing_examples.py b/tutorials/multimodal_client_testing_examples.py
@@ -23,10 +23,6 @@
 from typing import List
 from numpy.linalg import norm
 
-class ImageGenerator(Generator):
-    """Generator subclass for image generation."""
-    model_type = ModelType.IMAGE_GENERATION
-
 def test_basic_generation():
     """Test basic text generation"""
     client = OpenAIClient()
@@ -61,14 +57,15 @@ def test_invalid_image_url():
 def test_invalid_image_generation():
     """Test DALL-E generation with invalid parameters"""
     client = OpenAIClient()
-    gen = ImageGenerator(
+    gen = Generator(
         model_client=client,
         model_kwargs={
             "model": "dall-e-3",
             "size": "invalid_size",  # Invalid size parameter
             "quality": "standard",
             "n": 1
-        }
+        },
+        model_type=ModelType.IMAGE_GENERATION
     )
     
     print("\n=== Testing Invalid DALL-E Parameters ===")
@@ -94,14 +91,15 @@ def test_vision_and_generation():
     print(f"Description: {vision_response.raw_response}")
 
     # 2. Test DALL-E Image Generation
-    dalle_gen = ImageGenerator(
+    dalle_gen = Generator(
         model_client=client,
         model_kwargs={
             "model": "dall-e-3",
             "size": "1024x1024",
             "quality": "standard",
             "n": 1
-        }
+        },
+        model_type=ModelType.IMAGE_GENERATION
     )
     
     # For image generation, input_str becomes the prompt