From 6aeb4016e116364bb0c3c815afdfc37f8a39d8f9 Mon Sep 17 00:00:00 2001 From: daffaalex22 Date: Fri, 26 Dec 2025 20:12:02 +0700 Subject: [PATCH] feat(api): add image-to-image generation tool Add a new `image_to_image` function that enables image-to-image editing by combining a reference image with a text prompt. This allows for consistent character appearance or style across multiple images. The implementation includes: - Reference image processing (local files, URLs, or base64) - Subject reference payload construction - Image generation and saving logic - Cost warning in tool description Also update .gitignore to exclude .vscode/mcp.json --- .gitignore | 1 + minimax_mcp/server.py | 126 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 125 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index a12dd70..ae05bb0 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,7 @@ .venv .cursor .cursorignore +.vscode/mcp.json dist/ minimax_mcp.egg-info/ .coverage diff --git a/minimax_mcp/server.py b/minimax_mcp/server.py index c767441..817ccc8 100644 --- a/minimax_mcp/server.py +++ b/minimax_mcp/server.py @@ -571,6 +571,128 @@ def text_to_image( text=f"Failed to save images: {str(e)}" ) +@mcp.tool( + description="""Generate an image using a reference image and a text prompt (image-to-image generation). + + This tool enables image-to-image editing where users provide a reference image and a text prompt + to generate a new image that incorporates elements from the reference image while following the prompt. + This is useful for maintaining consistent character appearance or style across multiple images. + + COST WARNING: This tool makes an API call to Minimax which may incur costs. Only use when explicitly requested by the user. + + Args: + prompt (str, required): Text description of the desired output image. Max length 1500 characters. + Example: "A woman in a red dress walking in a garden" + reference_image (str, required): The reference image to use for generation. Can be: + - A local file path (e.g., "/path/to/image.jpg") + - A URL (e.g., "https://example.com/image.jpg") + - A base64 data URL (e.g., "data:image/jpeg;base64,...") + model (str, optional): The model to use. Values: ["image-01"]. Defaults to "image-01". + aspect_ratio (str, optional): The aspect ratio of the output image. + Values: ["1:1", "16:9", "4:3", "3:2", "2:3", "3:4", "9:16", "21:9"]. Defaults to "1:1". + n (int, optional): The number of images to generate. Values: [1-9]. Defaults to 1. + prompt_optimizer (bool, optional): Whether to optimize the prompt. Defaults to True. + output_directory (str, optional): The directory to save the generated image(s) to. + + Returns: + Text content with the path(s) to the output image file(s) or URLs depending on resource mode. + """ +) +def image_to_image( + prompt: str = "", + reference_image: str = "", + model: str = DEFAULT_T2I_MODEL, + aspect_ratio: str = "1:1", + n: int = 1, + prompt_optimizer: bool = True, + output_directory: str = None, +): + try: + if not prompt: + raise MinimaxRequestError("Prompt is required") + if not reference_image: + raise MinimaxRequestError("Reference image is required") + + # Process reference image - convert local file to base64 data URL if needed + processed_image = reference_image + if not isinstance(reference_image, str): + raise MinimaxRequestError(f"Reference image must be a string, got {type(reference_image)}") + + if not reference_image.startswith(("http://", "https://", "data:")): + # Local file path - convert to base64 data URL + if not os.path.exists(reference_image): + raise MinimaxRequestError(f"Reference image file does not exist: {reference_image}") + with open(reference_image, "rb") as f: + image_data = f.read() + # Detect image type from extension + ext = os.path.splitext(reference_image)[1].lower() + mime_type = "image/jpeg" # default + if ext == ".png": + mime_type = "image/png" + elif ext == ".gif": + mime_type = "image/gif" + elif ext == ".webp": + mime_type = "image/webp" + processed_image = f"data:{mime_type};base64,{base64.b64encode(image_data).decode('utf-8')}" + + # Build payload with subject_reference + payload = { + "model": model, + "prompt": prompt, + "aspect_ratio": aspect_ratio, + "n": n, + "prompt_optimizer": prompt_optimizer, + "subject_reference": [ + { + "type": "character", + "image_file": processed_image + } + ] + } + + response_data = api_client.post("/v1/image_generation", json=payload) + image_urls = response_data.get("data", {}).get("image_urls", []) + + if not image_urls: + raise MinimaxRequestError("No images generated") + + if resource_mode == RESOURCE_MODE_URL: + return TextContent( + type="text", + text=f"Success. Image URLs: {image_urls}" + ) + + output_path = build_output_path(output_directory, base_path) + output_file_names = [] + + for i, image_url in enumerate(image_urls): + output_file_name = build_output_file("image", f"i2i_{i}_{prompt[:30]}", output_path, "jpg") + output_path.parent.mkdir(parents=True, exist_ok=True) + + image_response = requests.get(image_url) + image_response.raise_for_status() + + with open(output_file_name, 'wb') as f: + f.write(image_response.content) + output_file_names.append(output_file_name) + + return TextContent( + type="text", + text=f"Success. Images saved as: {output_file_names}" + ) + + except MinimaxAPIError as e: + return TextContent( + type="text", + text=f"Failed to generate images: {str(e)}" + ) + except (IOError, requests.RequestException) as e: + return TextContent( + type="text", + text=f"Failed to save images: {str(e)}" + ) + + @mcp.tool( description="""Create a music generation task using AI models. Generate music from prompt and lyrics. @@ -580,14 +702,14 @@ def text_to_image( prompt (str): Music creation inspiration describing style, mood, scene, etc. Example: "Pop music, sad, suitable for rainy nights". Character range: [10, 300] lyrics (str): Song lyrics for music generation. - Use newline (\\n) to separate each line of lyrics. Supports lyric structure tags [Intro][Verse][Chorus][Bridge][Outro] + Use newline (\\n) to separate each line of lyrics. Supports lyric structure tags [Intro][Verse][Chorus][Bridge][Outro] to enhance musicality. Character range: [10, 600] (each Chinese character, punctuation, and letter counts as 1 character) stream (bool, optional): Whether to enable streaming mode. Defaults to False sample_rate (int, optional): Sample rate of generated music. Values: [16000, 24000, 32000, 44100] bitrate (int, optional): Bitrate of generated music. Values: [32000, 64000, 128000, 256000] format (str, optional): Format of generated music. Values: ["mp3", "wav", "pcm"]. Defaults to "mp3" output_directory (str, optional): Directory to save the generated music file - + Note: Currently supports generating music up to 1 minute in length. Returns: