Skip to content

Commit 0deef6e

Browse files
authored
feat: openai image generate (#287)
1 parent 1d1bd88 commit 0deef6e

File tree

7 files changed

+205
-23
lines changed

7 files changed

+205
-23
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
**Multi-AI Support** - Integrates all major LLMs: OpenAI, Anthropic, Google, xAI, Ollama, and more
1919
**Powerful Tools** - MCP protocol, web search, JS/Python code execution, data visualization
20+
**Image Generation** - Create and edit images with AI models (OpenAI, Google Gemini, xAI)
2021
**Automation** - Custom agents, visual workflows, artifact generation
2122
**Collaboration** - Share agents, workflows, and MCP configurations with your team
2223
**Voice Assistant** - Realtime voice chat with full MCP tool integration
@@ -210,13 +211,12 @@ Built-in web search powered by [Exa AI](https://exa.ai). Search the web with sem
210211

211212
#### 🎨 Image Generation
212213

213-
214214
<img width="1034" height="940" loading="lazy" alt="image-generation" src="https://github.com/user-attachments/assets/b081c837-8948-4f4d-a2f4-c8630cf0eaa2" />
215215

216216
Built-in image generation and editing capabilities powered by AI models. Create, edit, and modify images directly in your chats.
217217

218218
- **Supported Operations:** Image generation, editing, and composition
219-
- **Current Models:** Gemini Nano Banana, OpenAI (coming soon)
219+
- **Current Models:** Gemini Nano Banana, OpenAI
220220

221221
#### ⚡️ JS,PYTHON Executor
222222

src/app/api/chat/route.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ import {
4747
import { getSession } from "auth/server";
4848
import { colorize } from "consola/utils";
4949
import { generateUUID } from "lib/utils";
50-
import { nanoBananaTool } from "lib/ai/tools/image";
50+
import { nanoBananaTool, openaiImageTool } from "lib/ai/tools/image";
5151
import { ImageToolName } from "lib/ai/tools";
5252

5353
const logger = globalLogger.withDefaults({
@@ -209,7 +209,12 @@ export async function POST(request: Request) {
209209
);
210210

211211
const IMAGE_TOOL: Record<string, Tool> = useImageTool
212-
? { [ImageToolName]: nanoBananaTool }
212+
? {
213+
[ImageToolName]:
214+
imageTool?.model === "google"
215+
? nanoBananaTool
216+
: openaiImageTool,
217+
}
213218
: {};
214219
const vercelAITooles = safe({
215220
...MCP_TOOLS,

src/components/prompt-input.tsx

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ export default function PromptInput({
317317
);
318318

319319
const handleGenerateImage = useCallback(
320-
(provider?: "google") => {
320+
(provider?: "google" | "openai") => {
321321
if (!provider) {
322322
appStoreMutate({
323323
threadImageToolModel: {},
@@ -450,26 +450,24 @@ export default function PromptInput({
450450
// Handle ESC key to clear mentions
451451
useEffect(() => {
452452
const handleKeyDown = (e: KeyboardEvent) => {
453-
if (e.key === "Escape" && mentions.length > 0 && threadId) {
453+
if (
454+
e.key === "Escape" &&
455+
threadId &&
456+
(mentions.length > 0 || imageToolModel)
457+
) {
454458
e.preventDefault();
455459
e.stopPropagation();
456-
appStoreMutate((prev) => ({
457-
threadMentions: {
458-
...prev.threadMentions,
459-
[threadId]: [],
460-
},
460+
appStoreMutate(() => ({
461+
threadMentions: {},
461462
agentId: undefined,
463+
threadImageToolModel: {},
462464
}));
463465
editorRef.current?.commands.focus();
464466
}
465467
};
466468
window.addEventListener("keydown", handleKeyDown);
467469
return () => window.removeEventListener("keydown", handleKeyDown);
468-
}, [mentions.length, threadId, appStoreMutate]);
469-
470-
useEffect(() => {
471-
if (!editorRef.current) return;
472-
}, [editorRef.current]);
470+
}, [mentions.length, threadId, appStoreMutate, imageToolModel]);
473471

474472
return (
475473
<div className="max-w-3xl mx-auto fade-in animate-in">
@@ -598,6 +596,14 @@ export default function PromptInput({
598596
<GeminiIcon className="mr-2 size-4" />
599597
Gemini (Nano Banana)
600598
</DropdownMenuItem>
599+
<DropdownMenuItem
600+
disabled={modelInfo?.isToolCallUnsupported}
601+
onClick={() => handleGenerateImage("openai")}
602+
className="cursor-pointer"
603+
>
604+
<OpenAIIcon className="mr-2 size-4" />
605+
OpenAI
606+
</DropdownMenuItem>
601607
</DropdownMenuSubContent>
602608
</DropdownMenuPortal>
603609
</DropdownMenuSub>
@@ -625,6 +631,7 @@ export default function PromptInput({
625631
side="top"
626632
onSelectWorkflow={onSelectWorkflow}
627633
onSelectAgent={onSelectAgent}
634+
onGenerateImage={handleGenerateImage}
628635
mentions={mentions}
629636
/>
630637
</>

src/components/tool-invocation/image-generator.tsx

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,12 @@ function PureImageGeneratorToolInvocation({
7676
return (
7777
<div className="flex flex-col gap-4">
7878
<TextShimmer>{getModeText(mode)}</TextShimmer>
79-
<div className="w-full h-96 overflow-hidden relative">
79+
<div className="w-full h-96 overflow-hidden rounded-lg">
8080
<LetterGlitch />
81-
<div className="z-10 absolute inset-0 w-full h-1/12 bg-gradient-to-b to-90% from-background to-transparent pointer-events-none" />
82-
<div className="z-10 absolute inset-0 w-1/12 h-full bg-gradient-to-r from-background to-transparent pointer-events-none" />
83-
<div className="z-10 absolute left-0 bottom-0 w-full h-1/12 bg-gradient-to-t from-background to-transparent pointer-events-none" />
84-
<div className="z-10 absolute right-0 bottom-0 w-1/12 h-full bg-gradient-to-l from-background to-transparent pointer-events-none" />
8581
</div>
82+
<p className="text-xs text-muted-foreground text-center">
83+
Image generation may take up to 1 minute.
84+
</p>
8685
</div>
8786
);
8887
}

src/components/tool-select-dropdown.tsx

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import {
1111
CodeIcon,
1212
GlobeIcon,
1313
HardDriveUploadIcon,
14+
ImagesIcon,
1415
InfoIcon,
1516
Loader,
1617
MessageCircle,
@@ -77,6 +78,9 @@ import { mutate } from "swr";
7778
import { handleErrorWithToast } from "ui/shared-toast";
7879
import { useAgents } from "@/hooks/queries/use-agents";
7980
import { redriectMcpOauth } from "lib/ai/mcp/oauth-redirect";
81+
import { GeminiIcon } from "ui/gemini-icon";
82+
import { useChatModels } from "@/hooks/queries/use-chat-models";
83+
import { OpenAIIcon } from "ui/openai-icon";
8084

8185
interface ToolSelectDropdownProps {
8286
align?: "start" | "end" | "center";
@@ -85,6 +89,7 @@ interface ToolSelectDropdownProps {
8589
mentions?: ChatMention[];
8690
onSelectWorkflow?: (workflow: WorkflowSummary) => void;
8791
onSelectAgent?: (agent: AgentSummary) => void;
92+
onGenerateImage?: (provider?: "google" | "openai") => void;
8893
className?: string;
8994
}
9095

@@ -103,6 +108,7 @@ export function ToolSelectDropdown({
103108
side,
104109
onSelectWorkflow,
105110
onSelectAgent,
111+
onGenerateImage,
106112
mentions,
107113
className,
108114
}: ToolSelectDropdownProps) {
@@ -119,6 +125,18 @@ export function ToolSelectDropdown({
119125

120126
const t = useTranslations("Chat.Tool");
121127
const { isLoading } = useMcpList();
128+
const { data: providers } = useChatModels();
129+
const [globalModel] = appStore(useShallow((state) => [state.chatModel]));
130+
131+
const modelInfo = useMemo(() => {
132+
const provider = providers?.find(
133+
(provider) => provider.provider === globalModel?.provider,
134+
);
135+
const model = provider?.models.find(
136+
(model) => model.name === globalModel?.model,
137+
);
138+
return model;
139+
}, [providers, globalModel]);
122140

123141
useWorkflowToolList({
124142
refreshInterval: 1000 * 60 * 5,
@@ -236,6 +254,13 @@ export function ToolSelectDropdown({
236254
<div className="py-1">
237255
<DropdownMenuSeparator />
238256
</div>
257+
<ImageGeneratorSelector
258+
onGenerateImage={onGenerateImage}
259+
modelInfo={modelInfo}
260+
/>
261+
<div className="py-1">
262+
<DropdownMenuSeparator />
263+
</div>
239264
<div className="py-2">
240265
<ToolPresets />
241266
<div className="py-1">
@@ -1021,3 +1046,44 @@ function AgentSelector({
10211046
</DropdownMenuGroup>
10221047
);
10231048
}
1049+
1050+
function ImageGeneratorSelector({
1051+
onGenerateImage,
1052+
modelInfo,
1053+
}: {
1054+
onGenerateImage?: (provider?: "google" | "openai") => void;
1055+
modelInfo?: { isToolCallUnsupported?: boolean };
1056+
}) {
1057+
const t = useTranslations("Chat");
1058+
1059+
return (
1060+
<DropdownMenuGroup>
1061+
<DropdownMenuSub>
1062+
<DropdownMenuSubTrigger className="text-xs flex items-center gap-2 font-semibold cursor-pointer">
1063+
<ImagesIcon className="size-3.5" />
1064+
{t("generateImage")}
1065+
</DropdownMenuSubTrigger>
1066+
<DropdownMenuPortal>
1067+
<DropdownMenuSubContent>
1068+
<DropdownMenuItem
1069+
disabled={modelInfo?.isToolCallUnsupported}
1070+
onClick={() => onGenerateImage?.("google")}
1071+
className="cursor-pointer"
1072+
>
1073+
<GeminiIcon className="mr-2 size-4" />
1074+
Gemini (Nano Banana)
1075+
</DropdownMenuItem>
1076+
<DropdownMenuItem
1077+
disabled={modelInfo?.isToolCallUnsupported}
1078+
onClick={() => onGenerateImage?.("openai")}
1079+
className="cursor-pointer"
1080+
>
1081+
<OpenAIIcon className="mr-2 size-4" />
1082+
OpenAI
1083+
</DropdownMenuItem>
1084+
</DropdownMenuSubContent>
1085+
</DropdownMenuPortal>
1086+
</DropdownMenuSub>
1087+
</DropdownMenuGroup>
1088+
);
1089+
}

src/lib/ai/image/generate-image.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,15 @@ export async function generateImageWithXAI(
7474
export const generateImageWithNanoBanana = async (
7575
options: GenerateImageOptions,
7676
): Promise<GeneratedImageResult> => {
77+
const apiKey = process.env.GOOGLE_GENERATIVE_AI_API_KEY;
78+
if (!apiKey) {
79+
throw new Error("GOOGLE_GENERATIVE_AI_API_KEY is not set");
80+
}
81+
7782
const ai = new GoogleGenAI({
78-
apiKey: process.env.GOOGLE_GENERATIVE_AI_API_KEY,
83+
apiKey: apiKey,
7984
});
85+
8086
const geminiMessages: GeminiMessage[] = await safe(options.messages || [])
8187
.map((messages) => Promise.all(messages.map(convertToGeminiMessage)))
8288
.watch(watchError(logger.error))

src/lib/ai/tools/image/index.ts

Lines changed: 100 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,18 @@
1-
import { FilePart, ModelMessage, ToolResultPart, tool as createTool } from "ai";
1+
import {
2+
FilePart,
3+
ImagePart,
4+
ModelMessage,
5+
ToolResultPart,
6+
tool as createTool,
7+
generateText,
8+
} from "ai";
29
import { generateImageWithNanoBanana } from "lib/ai/image/generate-image";
310
import { serverFileStorage } from "lib/file-storage";
411
import { safe, watchError } from "ts-safe";
512
import z from "zod";
613
import { ImageToolName } from "..";
714
import logger from "logger";
15+
import { openai } from "@ai-sdk/openai";
816

917
export type ImageToolResult = {
1018
images: {
@@ -99,6 +107,97 @@ export const nanoBananaTool = createTool({
99107
},
100108
});
101109

110+
export const openaiImageTool = createTool({
111+
name: ImageToolName,
112+
description: `Generate, edit, or composite images based on the conversation context. This tool automatically analyzes recent messages to create images without requiring explicit input parameters. It includes all user-uploaded images from the recent conversation and only the most recent AI-generated image to avoid confusion. Use the 'mode' parameter to specify the operation type: 'create' for new images, 'edit' for modifying existing images, or 'composite' for combining multiple images. Use this when the user requests image creation, modification, or visual content generation.`,
113+
inputSchema: z.object({
114+
mode: z
115+
.enum(["create", "edit", "composite"])
116+
.optional()
117+
.default("create")
118+
.describe(
119+
"Image generation mode: 'create' for new images, 'edit' for modifying existing images, 'composite' for combining multiple images",
120+
),
121+
}),
122+
execute: async ({ mode }, { messages, abortSignal }) => {
123+
const apiKey = process.env.OPENAI_API_KEY;
124+
if (!apiKey) {
125+
throw new Error("OPENAI_API_KEY is not set");
126+
}
127+
128+
let hasFoundImage = false;
129+
const latestMessages = messages
130+
.slice(-6)
131+
.reverse()
132+
.flatMap((m) => {
133+
if (m.role != "tool") return m;
134+
if (hasFoundImage) return null; // Skip if we already found an image
135+
const fileParts = m.content.flatMap(convertToImageToolPartToImagePart);
136+
if (fileParts.length === 0) return null;
137+
hasFoundImage = true; // Mark that we found the most recent image
138+
return [
139+
{
140+
role: "user",
141+
content: fileParts,
142+
},
143+
m,
144+
] as ModelMessage[];
145+
})
146+
.filter((v) => Boolean(v?.content?.length))
147+
.reverse() as ModelMessage[];
148+
const result = await generateText({
149+
model: openai("gpt-4.1-mini"),
150+
abortSignal,
151+
messages: latestMessages,
152+
tools: {
153+
image_generation: openai.tools.imageGeneration({
154+
outputFormat: "webp",
155+
model: "gpt-image-1",
156+
}),
157+
},
158+
toolChoice: "required",
159+
});
160+
161+
for (const toolResult of result.staticToolResults) {
162+
if (toolResult.toolName === "image_generation") {
163+
const base64Image = toolResult.output.result;
164+
const uploadedImage = await serverFileStorage
165+
.upload(Buffer.from(base64Image, "base64"), {
166+
contentType: "image/webp",
167+
})
168+
.catch(() => {
169+
throw new Error(
170+
"Image generation was successful, but file upload failed. Please check your file upload configuration and try again.",
171+
);
172+
});
173+
return {
174+
images: [{ url: uploadedImage.sourceUrl, mimeType: "image/webp" }],
175+
mode,
176+
model: "gpt-4.1",
177+
guide:
178+
"The image has been successfully generated and is now displayed above. If you need any edits, modifications, or adjustments to the image, please let me know.",
179+
};
180+
}
181+
}
182+
return {
183+
images: [],
184+
mode,
185+
model: "gpt-4.1",
186+
guide: "",
187+
};
188+
},
189+
});
190+
191+
function convertToImageToolPartToImagePart(part: ToolResultPart): ImagePart[] {
192+
if (part.toolName !== ImageToolName) return [];
193+
const result = part.output.value as ImageToolResult;
194+
return result.images.map((image) => ({
195+
type: "image",
196+
image: image.url,
197+
mediaType: image.mimeType,
198+
}));
199+
}
200+
102201
function convertToImageToolPartToFilePart(part: ToolResultPart): FilePart[] {
103202
if (part.toolName !== ImageToolName) return [];
104203
const result = part.output.value as ImageToolResult;

Comments (0)