Skip to content

Commit 9ed3d53

Browse files
committed
feat: add image generation tool with OpenRouter integration
- Add experimental image generation feature using OpenRouter API
- Implement generate_image tool for AI-driven image creation
- Add ImageViewer component with zoom, copy, and save functionality
- Create settings UI for API key configuration and model selection
- Integrate with approval system and auto-approval for write permissions
- Add collapsible approval dialog matching existing tool patterns
- Support for multiple image generation models (starting with Gemini 2.5 Flash)
- Add i18n support with proper translation strings
- Respect file protection and workspace boundaries
- Display generated images inline in chat with rich controls
1 parent d4a16f4 commit 9ed3d53

39 files changed

+893
-19
lines changed

packages/types/src/experiment.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,12 @@ import type { Keys, Equals, AssertEqual } from "./type-fu.js"
66
* ExperimentId
77
*/
88

9-
export const experimentIds = ["powerSteering", "multiFileApplyDiff", "preventFocusDisruption"] as const
9+
export const experimentIds = [
10+
"powerSteering",
11+
"multiFileApplyDiff",
12+
"preventFocusDisruption",
13+
"imageGeneration",
14+
] as const
1015

1116
export const experimentIdsSchema = z.enum(experimentIds)
1217

@@ -20,6 +25,7 @@ export const experimentsSchema = z.object({
2025
powerSteering: z.boolean().optional(),
2126
multiFileApplyDiff: z.boolean().optional(),
2227
preventFocusDisruption: z.boolean().optional(),
28+
imageGeneration: z.boolean().optional(),
2329
})
2430

2531
export type Experiments = z.infer<typeof experimentsSchema>

packages/types/src/provider-settings.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,14 @@ const baseProviderSettingsSchema = z.object({
112112

113113
// Model verbosity.
114114
verbosity: verbosityLevelsSchema.optional(),
115+
116+
// Image generation settings (experimental)
117+
imageGenerationSettings: z
118+
.object({
119+
openRouterApiKey: z.string().optional(),
120+
selectedModel: z.string().optional(),
121+
})
122+
.optional(),
115123
})
116124

117125
// Several of the providers share common model config properties.

packages/types/src/tool.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ export const toolNames = [
3434
"fetch_instructions",
3535
"codebase_search",
3636
"update_todo_list",
37+
"generate_image",
3738
] as const
3839

3940
export const toolNamesSchema = z.enum(toolNames)

src/core/assistant-message/presentAssistantMessage.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import { attemptCompletionTool } from "../tools/attemptCompletionTool"
2828
import { newTaskTool } from "../tools/newTaskTool"
2929

3030
import { updateTodoListTool } from "../tools/updateTodoListTool"
31+
import { generateImageTool } from "../tools/generateImageTool"
3132

3233
import { formatResponse } from "../prompts/responses"
3334
import { validateToolUse } from "../tools/validateToolUse"
@@ -221,6 +222,8 @@ export async function presentAssistantMessage(cline: Task) {
221222
const modeName = getModeBySlug(mode, customModes)?.name ?? mode
222223
return `[${block.name} in ${modeName} mode: '${message}']`
223224
}
225+
case "generate_image":
226+
return `[${block.name} for '${block.params.path}']`
224227
}
225228
}
226229

@@ -546,6 +549,9 @@ export async function presentAssistantMessage(cline: Task) {
546549
askFinishSubTaskApproval,
547550
)
548551
break
552+
case "generate_image":
553+
await generateImageTool(cline, block, askApproval, handleError, pushToolResult, removeClosingTag)
554+
break
549555
}
550556

551557
break
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
import { ToolArgs } from "./types"
2+
3+
/**
 * Builds the prompt section that documents the `generate_image` tool.
 *
 * @param args Tool arguments; only `args.cwd` is read, to tell the model which
 * directory the `path` parameter is resolved against.
 * @returns The tool description as newline-separated text.
 */
export function getGenerateImageDescription(args: ToolArgs): string {
	const workspaceDir = args.cwd

	const usageLines = [
		"Usage:",
		"<generate_image>",
		"<prompt>Your image description here</prompt>",
		"<path>path/to/save/image.png</path>",
		"</generate_image>",
	]

	const exampleLines = [
		"Example: Requesting to generate a sunset image",
		"<generate_image>",
		"<prompt>A beautiful sunset over mountains with vibrant orange and purple colors</prompt>",
		"<path>images/sunset.png</path>",
		"</generate_image>",
	]

	return [
		"## generate_image",
		"Description: Request to generate an image using AI models through OpenRouter API. This tool creates images from text prompts and saves them to the specified path. Requires OpenRouter API key to be configured in experimental settings.",
		"Parameters:",
		"- prompt: (required) The text prompt describing the image to generate",
		`- path: (required) The file path where the generated image should be saved (relative to the current workspace directory ${workspaceDir}). The tool will automatically add the appropriate image extension if not provided.`,
		...usageLines,
		// A single blank line separates the usage template from the example.
		"",
		...exampleLines,
	].join("\n")
}

src/core/prompts/tools/index.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import { getSwitchModeDescription } from "./switch-mode"
2525
import { getNewTaskDescription } from "./new-task"
2626
import { getCodebaseSearchDescription } from "./codebase-search"
2727
import { getUpdateTodoListDescription } from "./update-todo-list"
28+
import { getGenerateImageDescription } from "./generate-image"
2829
import { CodeIndexManager } from "../../../services/code-index/manager"
2930

3031
// Map of tool names to their description functions
@@ -56,6 +57,7 @@ const toolDescriptionMap: Record<string, (args: ToolArgs) => string | undefined>
5657
apply_diff: (args) =>
5758
args.diffStrategy ? args.diffStrategy.getToolDescription({ cwd: args.cwd, toolOptions: args.toolOptions }) : "",
5859
update_todo_list: (args) => getUpdateTodoListDescription(args),
60+
generate_image: (args) => getGenerateImageDescription(args),
5961
}
6062

6163
export function getToolDescriptionsForMode(
@@ -129,6 +131,11 @@ export function getToolDescriptionsForMode(
129131
tools.delete("update_todo_list")
130132
}
131133

134+
// Conditionally exclude generate_image if experiment is not enabled
135+
if (!experiments?.imageGeneration) {
136+
tools.delete("generate_image")
137+
}
138+
132139
// Map tool descriptions for allowed tools
133140
const descriptions = Array.from(tools).map((toolName) => {
134141
const descriptionFn = toolDescriptionMap[toolName]
@@ -164,4 +171,5 @@ export {
164171
getInsertContentDescription,
165172
getSearchAndReplaceDescription,
166173
getCodebaseSearchDescription,
174+
getGenerateImageDescription,
167175
}
Lines changed: 264 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,264 @@
1+
import path from "path"
2+
import fs from "fs/promises"
3+
import * as vscode from "vscode"
4+
import { Task } from "../task/Task"
5+
import { formatResponse } from "../prompts/responses"
6+
import { ToolUse, AskApproval, HandleError, PushToolResult, RemoveClosingTag } from "../../shared/tools"
7+
import { fileExistsAtPath } from "../../utils/fs"
8+
import { getReadablePath } from "../../utils/path"
9+
import { isPathOutsideWorkspace } from "../../utils/pathUtils"
10+
import { EXPERIMENT_IDS, experiments } from "../../shared/experiments"
11+
import { safeWriteJson } from "../../utils/safeWriteJson"
12+
13+
/**
 * Image generation models known to this tool. The list is hardcoded for now;
 * append further model ids here as they become available.
 */
const IMAGE_GENERATION_MODELS: string[] = ["google/gemini-2.5-flash-image-preview"]
18+
19+
/** A single image entry attached to a response message. */
interface GeneratedImagePart {
	type?: string
	image_url?: {
		// Read by the tool as a `data:image/...;base64,...` URL.
		url?: string
	}
}

/** The message carried by one response choice. */
interface ImageGenerationMessage {
	content?: string
	images?: GeneratedImagePart[]
}

/** One entry of the response's `choices` array. */
interface ImageGenerationChoice {
	message?: ImageGenerationMessage
}

/** Error payload that may be present in the response body. */
interface ImageGenerationErrorInfo {
	message?: string
	type?: string
	code?: string
}

/**
 * Shape of the JSON body the tool reads back from the image generation
 * request. Every field is optional, so consumers must check each level.
 */
interface ImageGenerationResponse {
	choices?: ImageGenerationChoice[]
	error?: ImageGenerationErrorInfo
}
37+
38+
/** User-facing explanation emitted when no OpenRouter API key can be found. */
const MISSING_OPENROUTER_KEY_MESSAGE =
	"OpenRouter API key is required for image generation. Please configure it in the Image Generation experimental settings or use a profile with an OpenRouter API key."

/** Matches a base64 image data URL; group 1 is the image format, group 2 the base64 payload. */
const IMAGE_DATA_URL_PATTERN = /^data:image\/(png|jpeg|jpg);base64,(.+)$/

/** Matches paths that already end in one of the supported image extensions. */
const IMAGE_EXTENSION_PATTERN = /\.(png|jpg|jpeg)$/i

/** The two image generation settings this tool consumes. */
interface ImageGenerationSettingsView {
	openRouterApiKey?: string
	selectedModel?: string
}

/**
 * Reads `imageGenerationSettings` off an arbitrary object, narrowing at runtime
 * instead of casting to `any`. Returns undefined when the container or the
 * settings object is missing; non-string fields are dropped.
 */
function readImageGenerationSettings(container: unknown): ImageGenerationSettingsView | undefined {
	if (typeof container !== "object" || container === null) {
		return undefined
	}
	const candidate = (container as Record<string, unknown>).imageGenerationSettings
	if (typeof candidate !== "object" || candidate === null) {
		return undefined
	}
	const { openRouterApiKey, selectedModel } = candidate as Record<string, unknown>
	return {
		openRouterApiKey: typeof openRouterApiKey === "string" ? openRouterApiKey : undefined,
		selectedModel: typeof selectedModel === "string" ? selectedModel : undefined,
	}
}

/**
 * Builds the error message for a non-OK HTTP response: the API's own
 * `error.message` when the body is JSON that carries one, otherwise the
 * HTTP status code and status text.
 */
async function describeHttpFailure(response: Response): Promise<string> {
	const fallback = `Failed to generate image: ${response.status} ${response.statusText}`
	const bodyText = await response.text()
	try {
		const parsed: unknown = JSON.parse(bodyText)
		const apiMessage = (parsed as ImageGenerationResponse | null)?.error?.message
		return apiMessage ? `Failed to generate image: ${apiMessage}` : fallback
	} catch {
		// Body was not JSON; the status line is the best information available.
		return fallback
	}
}

/**
 * Handles a `generate_image` tool use: validates the request, asks for
 * approval, requests an image from the OpenRouter chat completions endpoint,
 * writes the decoded image under the workspace and reports the result.
 *
 * Every failure path reports through `pushToolResult` (and, for most of them,
 * a chat message) rather than throwing; exceptions raised from the approval
 * step onwards are routed to `handleError`.
 *
 * @param cline Task that owns this tool use.
 * @param block Parsed tool use; `params.prompt` and `params.path` are read.
 * @param askApproval Callback used to request user approval before generating.
 * @param handleError Callback invoked with the action name and the thrown error.
 * @param pushToolResult Callback that delivers the tool result back to the model.
 * @param removeClosingTag Callback applied to the `path` parameter before it is
 * resolved and displayed.
 */
export async function generateImageTool(
	cline: Task,
	block: ToolUse,
	askApproval: AskApproval,
	handleError: HandleError,
	pushToolResult: PushToolResult,
	removeClosingTag: RemoveClosingTag,
) {
	// This tool renders nothing while the block is still streaming. Returning
	// before any validation keeps errors from being said or pushed once per
	// partial chunk; everything runs exactly once on the complete block.
	if (block.partial) {
		return
	}

	const prompt: string | undefined = block.params.prompt
	const relPath: string | undefined = block.params.path

	// Shows an error in the chat and returns the same text as the tool result.
	const reportFailure = async (message: string): Promise<void> => {
		await cline.say("error", message)
		pushToolResult(formatResponse.toolError(message))
	}

	// Check if the experiment is enabled
	const provider = cline.providerRef.deref()
	const state = await provider?.getState()
	if (!experiments.isEnabled(state?.experiments ?? {}, EXPERIMENT_IDS.IMAGE_GENERATION)) {
		pushToolResult(
			formatResponse.toolError(
				"Image generation is an experimental feature that must be enabled in settings. Please enable 'Image Generation' in the Experimental Settings section.",
			),
		)
		return
	}

	if (!prompt) {
		cline.consecutiveMistakeCount++
		cline.recordToolError("generate_image")
		pushToolResult(await cline.sayAndCreateMissingParamError("generate_image", "prompt"))
		return
	}

	if (!relPath) {
		cline.consecutiveMistakeCount++
		cline.recordToolError("generate_image")
		pushToolResult(await cline.sayAndCreateMissingParamError("generate_image", "path"))
		return
	}

	// Validate access permissions
	const accessAllowed = cline.rooIgnoreController?.validateAccess(relPath)
	if (!accessAllowed) {
		await cline.say("rooignore_error", relPath)
		pushToolResult(formatResponse.toolError(formatResponse.rooIgnoreError(relPath)))
		return
	}

	// Check if file is write-protected
	const isWriteProtected = cline.rooProtectedController?.isWriteProtected(relPath) || false

	// Settings may sit directly on the state or on the active API configuration
	// (the provider settings schema declares `imageGenerationSettings`), so both
	// are consulted. The profile's own OpenRouter key is the last resort.
	const apiConfiguration = state?.apiConfiguration
	const stateSettings = readImageGenerationSettings(state)
	const profileSettings = readImageGenerationSettings(apiConfiguration)

	const openRouterApiKey =
		stateSettings?.openRouterApiKey || profileSettings?.openRouterApiKey || apiConfiguration?.openRouterApiKey

	if (!openRouterApiKey) {
		await reportFailure(MISSING_OPENROUTER_KEY_MESSAGE)
		return
	}

	// Get selected model from settings or use default
	const selectedModel = stateSettings?.selectedModel || profileSettings?.selectedModel || IMAGE_GENERATION_MODELS[0]

	// Determine if the path is outside the workspace
	const requestedPath = removeClosingTag("path", relPath)
	const isOutsideWorkspace = isPathOutsideWorkspace(path.resolve(cline.cwd, requestedPath))

	try {
		cline.consecutiveMistakeCount = 0

		// Ask for approval before generating the image
		const approvalMessage = JSON.stringify({
			tool: "generateImage" as const,
			path: getReadablePath(cline.cwd, requestedPath),
			content: prompt,
			isOutsideWorkspace,
			isProtected: isWriteProtected,
		})

		const didApprove = await askApproval("tool", approvalMessage, undefined, isWriteProtected)
		if (!didApprove) {
			return
		}

		// Call OpenRouter API to generate image
		const response = await fetch("https://openrouter.ai/api/v1/chat/completions", {
			method: "POST",
			headers: {
				Authorization: `Bearer ${openRouterApiKey}`,
				"Content-Type": "application/json",
				"HTTP-Referer": "https://github.com/RooVetGit/Roo-Code",
				"X-Title": "Roo Code",
			},
			body: JSON.stringify({
				model: selectedModel,
				messages: [
					{
						role: "user",
						content: prompt,
					},
				],
				modalities: ["image", "text"],
			}),
		})

		if (!response.ok) {
			await reportFailure(await describeHttpFailure(response))
			return
		}

		const result: ImageGenerationResponse = await response.json()

		if (result.error) {
			// The error object may come without a message; avoid printing "undefined".
			await reportFailure(`Failed to generate image: ${result.error.message ?? "Unknown error"}`)
			return
		}

		// Extract the generated image from the response
		const images = result.choices?.[0]?.message?.images
		if (!images || images.length === 0) {
			await reportFailure("No image was generated in the response")
			return
		}

		const imageData = images[0]?.image_url?.url
		if (!imageData) {
			await reportFailure("Invalid image data in response")
			return
		}

		// Extract base64 data from data URL
		const dataUrlMatch = IMAGE_DATA_URL_PATTERN.exec(imageData)
		if (!dataUrlMatch) {
			await reportFailure("Invalid image format received")
			return
		}
		const [, imageFormat, base64Data] = dataUrlMatch

		// Ensure the file has an image extension, derived from the returned format
		let finalPath = relPath
		if (!IMAGE_EXTENSION_PATTERN.test(finalPath)) {
			finalPath = `${finalPath}.${imageFormat === "jpeg" ? "jpg" : imageFormat}`
		}

		// Access was validated for the path as requested; when an extension was
		// appended the file actually written is a different one, so check again.
		if (finalPath !== relPath && !cline.rooIgnoreController?.validateAccess(finalPath)) {
			await cline.say("rooignore_error", finalPath)
			pushToolResult(formatResponse.toolError(formatResponse.rooIgnoreError(finalPath)))
			return
		}

		// Create the target directory if needed, then write the decoded image
		const absolutePath = path.resolve(cline.cwd, finalPath)
		await fs.mkdir(path.dirname(absolutePath), { recursive: true })
		await fs.writeFile(absolutePath, Buffer.from(base64Data, "base64"))

		// Track file creation
		await cline.fileContextTracker.trackFileContext(finalPath, "roo_edited")
		cline.didEditFile = true

		// Display the generated image in the chat using a text message with the image
		const readablePath = getReadablePath(cline.cwd, finalPath)
		await cline.say("text", `Image generated and saved to: ${readablePath}`, [imageData])

		pushToolResult(formatResponse.toolResult(`Image created successfully at ${readablePath}`))
	} catch (error) {
		await handleError("generating image", error)
	}
}

0 commit comments

Comments
 (0)