Skip to content

Commit 5ddd463

Browse files
roomote[bot], roomote, and daniel-lxs
authored and committed
feat: add optional input image parameter to image generation tool (#7525)
Co-authored-by: Roo Code <[email protected]> Co-authored-by: Daniel Riccio <[email protected]>
1 parent 3f51a57 commit 5ddd463

File tree

5 files changed: 118 additions and 12 deletions

src/api/providers/openrouter.ts

Lines changed: 21 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -275,9 +275,15 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
275275
* @param prompt The text prompt for image generation
276276
* @param model The model to use for generation
277277
* @param apiKey The OpenRouter API key (must be explicitly provided)
278+
* @param inputImage Optional base64 encoded input image data URL
278279
* @returns The generated image data and format, or an error
279280
*/
280-
async generateImage(prompt: string, model: string, apiKey: string): Promise<ImageGenerationResult> {
281+
async generateImage(
282+
prompt: string,
283+
model: string,
284+
apiKey: string,
285+
inputImage?: string,
286+
): Promise<ImageGenerationResult> {
281287
if (!apiKey) {
282288
return {
283289
success: false,
@@ -299,7 +305,20 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
299305
messages: [
300306
{
301307
role: "user",
302-
content: prompt,
308+
content: inputImage
309+
? [
310+
{
311+
type: "text",
312+
text: prompt,
313+
},
314+
{
315+
type: "image_url",
316+
image_url: {
317+
url: inputImage,
318+
},
319+
},
320+
]
321+
: prompt,
303322
},
304323
],
305324
modalities: ["image", "text"],

src/core/prompts/tools/generate-image.ts

Lines changed: 19 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,19 +2,35 @@ import { ToolArgs } from "./types"
22

33
export function getGenerateImageDescription(args: ToolArgs): string {
44
return `## generate_image
5-
Description: Request to generate an image using AI models through OpenRouter API. This tool creates images from text prompts and saves them to the specified path.
5+
Description: Request to generate or edit an image using AI models through OpenRouter API. This tool can create new images from text prompts or modify existing images based on your instructions. When an input image is provided, the AI will apply the requested edits, transformations, or enhancements to that image.
66
Parameters:
7-
- prompt: (required) The text prompt describing the image to generate
8-
- path: (required) The file path where the generated image should be saved (relative to the current workspace directory ${args.cwd}). The tool will automatically add the appropriate image extension if not provided.
7+
- prompt: (required) The text prompt describing what to generate or how to edit the image
8+
- path: (required) The file path where the generated/edited image should be saved (relative to the current workspace directory ${args.cwd}). The tool will automatically add the appropriate image extension if not provided.
9+
- image: (optional) The file path to an input image to edit or transform (relative to the current workspace directory ${args.cwd}). Supported formats: PNG, JPG, JPEG, GIF, WEBP.
910
Usage:
1011
<generate_image>
1112
<prompt>Your image description here</prompt>
1213
<path>path/to/save/image.png</path>
14+
<image>path/to/input/image.jpg</image>
1315
</generate_image>
1416
1517
Example: Requesting to generate a sunset image
1618
<generate_image>
1719
<prompt>A beautiful sunset over mountains with vibrant orange and purple colors</prompt>
1820
<path>images/sunset.png</path>
21+
</generate_image>
22+
23+
Example: Editing an existing image
24+
<generate_image>
25+
<prompt>Transform this image into a watercolor painting style</prompt>
26+
<path>images/watercolor-output.png</path>
27+
<image>images/original-photo.jpg</image>
28+
</generate_image>
29+
30+
Example: Upscaling and enhancing an image
31+
<generate_image>
32+
<prompt>Upscale this image to higher resolution, enhance details, improve clarity and sharpness while maintaining the original content and composition</prompt>
33+
<path>images/enhanced-photo.png</path>
34+
<image>images/low-res-photo.jpg</image>
1935
</generate_image>`
2036
}

src/core/tools/__tests__/generateImageTool.test.ts

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -46,8 +46,12 @@ describe("generateImageTool", () => {
4646
experiments: {
4747
[EXPERIMENT_IDS.IMAGE_GENERATION]: true,
4848
},
49-
openRouterImageApiKey: "test-api-key",
50-
openRouterImageGenerationSelectedModel: "google/gemini-2.5-flash-image-preview",
49+
apiConfiguration: {
50+
openRouterImageGenerationSettings: {
51+
openRouterApiKey: "test-api-key",
52+
selectedModel: "google/gemini-2.5-flash-image-preview",
53+
},
54+
},
5155
}),
5256
}),
5357
},

src/core/tools/generateImageTool.ts

Lines changed: 70 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ export async function generateImageTool(
2424
) {
2525
const prompt: string | undefined = block.params.prompt
2626
const relPath: string | undefined = block.params.path
27+
const inputImagePath: string | undefined = block.params.image
2728

2829
// Check if the experiment is enabled
2930
const provider = cline.providerRef.deref()
@@ -39,8 +40,7 @@ export async function generateImageTool(
3940
return
4041
}
4142

42-
if (block.partial && (!prompt || !relPath)) {
43-
// Wait for complete parameters
43+
if (block.partial) {
4444
return
4545
}
4646

@@ -66,6 +66,66 @@ export async function generateImageTool(
6666
return
6767
}
6868

69+
// If input image is provided, validate it exists and can be read
70+
let inputImageData: string | undefined
71+
if (inputImagePath) {
72+
const inputImageFullPath = path.resolve(cline.cwd, inputImagePath)
73+
74+
// Check if input image exists
75+
const inputImageExists = await fileExistsAtPath(inputImageFullPath)
76+
if (!inputImageExists) {
77+
await cline.say("error", `Input image not found: ${getReadablePath(cline.cwd, inputImagePath)}`)
78+
pushToolResult(
79+
formatResponse.toolError(`Input image not found: ${getReadablePath(cline.cwd, inputImagePath)}`),
80+
)
81+
return
82+
}
83+
84+
// Validate input image access permissions
85+
const inputImageAccessAllowed = cline.rooIgnoreController?.validateAccess(inputImagePath)
86+
if (!inputImageAccessAllowed) {
87+
await cline.say("rooignore_error", inputImagePath)
88+
pushToolResult(formatResponse.toolError(formatResponse.rooIgnoreError(inputImagePath)))
89+
return
90+
}
91+
92+
// Read the input image file
93+
try {
94+
const imageBuffer = await fs.readFile(inputImageFullPath)
95+
const imageExtension = path.extname(inputImageFullPath).toLowerCase().replace(".", "")
96+
97+
// Validate image format
98+
const supportedFormats = ["png", "jpg", "jpeg", "gif", "webp"]
99+
if (!supportedFormats.includes(imageExtension)) {
100+
await cline.say(
101+
"error",
102+
`Unsupported image format: ${imageExtension}. Supported formats: ${supportedFormats.join(", ")}`,
103+
)
104+
pushToolResult(
105+
formatResponse.toolError(
106+
`Unsupported image format: ${imageExtension}. Supported formats: ${supportedFormats.join(", ")}`,
107+
),
108+
)
109+
return
110+
}
111+
112+
// Convert to base64 data URL
113+
const mimeType = imageExtension === "jpg" ? "jpeg" : imageExtension
114+
inputImageData = `data:image/${mimeType};base64,${imageBuffer.toString("base64")}`
115+
} catch (error) {
116+
await cline.say(
117+
"error",
118+
`Failed to read input image: ${error instanceof Error ? error.message : "Unknown error"}`,
119+
)
120+
pushToolResult(
121+
formatResponse.toolError(
122+
`Failed to read input image: ${error instanceof Error ? error.message : "Unknown error"}`,
123+
),
124+
)
125+
return
126+
}
127+
}
128+
69129
// Check if file is write-protected
70130
const isWriteProtected = cline.rooProtectedController?.isWriteProtected(relPath) || false
71131

@@ -110,6 +170,7 @@ export async function generateImageTool(
110170
const approvalMessage = JSON.stringify({
111171
...sharedMessageProps,
112172
content: prompt,
173+
...(inputImagePath && { inputImage: getReadablePath(cline.cwd, inputImagePath) }),
113174
})
114175

115176
const didApprove = await askApproval("tool", approvalMessage, undefined, isWriteProtected)
@@ -121,8 +182,13 @@ export async function generateImageTool(
121182
// Create a temporary OpenRouter handler with minimal options
122183
const openRouterHandler = new OpenRouterHandler({} as any)
123184

124-
// Call the generateImage method with the explicit API key
125-
const result = await openRouterHandler.generateImage(prompt, selectedModel, openRouterApiKey)
185+
// Call the generateImage method with the explicit API key and optional input image
186+
const result = await openRouterHandler.generateImage(
187+
prompt,
188+
selectedModel,
189+
openRouterApiKey,
190+
inputImageData,
191+
)
126192

127193
if (!result.success) {
128194
await cline.say("error", result.error || "Failed to generate image")

src/shared/tools.ts

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,7 @@ export const toolParamNames = [
6666
"args",
6767
"todos",
6868
"prompt",
69+
"image",
6970
] as const
7071

7172
export type ToolParamName = (typeof toolParamNames)[number]
@@ -167,7 +168,7 @@ export interface SearchAndReplaceToolUse extends ToolUse {
167168

168169
export interface GenerateImageToolUse extends ToolUse {
169170
name: "generate_image"
170-
params: Partial<Pick<Record<ToolParamName, string>, "prompt" | "path">>
171+
params: Partial<Pick<Record<ToolParamName, string>, "prompt" | "path" | "image">>
171172
}
172173

173174
// Define tool group configuration

0 commit comments

Comments
 (0)