Skip to content

Commit 68a7c83

Browse files
daniel-lxs authored and hannesrudolph committed
feat: add Gemini provider support for image generation
- Create centralized image generation models configuration in types package
- Implement generateImage() method in Gemini provider for text-to-image and image-to-image
- Add provider dropdown in UI settings to switch between OpenRouter and Gemini
- Update generateImageTool to route requests to appropriate provider
- Add settings persistence for Gemini API key and model selection
- Translate all new UI strings across 18 supported languages
- Fix settings state management to properly handle new image generation settings
1 parent a8f87d2 commit 68a7c83

30 files changed

+654
-90
lines changed

packages/types/src/global-settings.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,11 @@ export const globalSettingsSchema = z.object({
4444
dismissedUpsells: z.array(z.string()).optional(),
4545

4646
// Image generation settings (experimental) - flattened for simplicity
47+
imageGenerationProvider: z.enum(["openrouter", "gemini"]).optional(),
4748
openRouterImageApiKey: z.string().optional(),
4849
openRouterImageGenerationSelectedModel: z.string().optional(),
50+
geminiImageApiKey: z.string().optional(),
51+
geminiImageGenerationSelectedModel: z.string().optional(),
4952

5053
condensingApiConfigId: z.string().optional(),
5154
customCondensingPrompt: z.string().optional(),
@@ -210,7 +213,8 @@ export const SECRET_STATE_KEYS = [
210213

211214
// Global secrets that are part of GlobalSettings (not ProviderSettings)
212215
export const GLOBAL_SECRET_KEYS = [
213-
"openRouterImageApiKey", // For image generation
216+
"openRouterImageApiKey", // For OpenRouter image generation
217+
"geminiImageApiKey", // For Gemini image generation
214218
] as const
215219

216220
// Type for the actual secret storage keys
packages/types/src/image-generation.ts

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import { z } from "zod"
2+
3+
/**
 * Image generation providers.
 *
 * The readonly tuple is the single source of truth: both the zod schema and
 * the `ImageGenerationProvider` union are derived from it.
 */
export const imageGenerationProviders = ["openrouter", "gemini"] as const
export const imageGenerationProviderSchema = z.enum(imageGenerationProviders)
export type ImageGenerationProvider = (typeof imageGenerationProviders)[number]
9+
10+
/**
 * Describes a single image generation model offered by a provider.
 */
export interface ImageGenerationModelInfo {
	/** Provider that serves this model. */
	provider: ImageGenerationProvider
	/** Model identifier as passed to the provider's API. */
	modelId: string
	/** Human-readable model name. */
	label: string
	/** Whether the model supports image editing (text + image input). */
	supportsEditMode?: boolean
	/** Maximum input image size in MB. */
	maxInputSize?: number
	/** Supported output formats. */
	outputFormats?: string[]
}
21+
22+
/**
 * Image generation models, grouped by provider.
 *
 * Entry order matters: generateImageTool falls back to the first model of the
 * selected provider when no model has been chosen in the settings.
 */
export const IMAGE_GENERATION_MODELS: Record<ImageGenerationProvider, ImageGenerationModelInfo[]> = {
	// Models reached through OpenRouter use vendor-prefixed ids.
	openrouter: [
		{
			provider: "openrouter",
			modelId: "google/gemini-2.5-flash-image-preview",
			label: "Gemini 2.5 Flash Image Preview",
			supportsEditMode: true,
			outputFormats: ["png", "jpeg"],
		},
		{
			provider: "openrouter",
			modelId: "google/gemini-2.5-flash-image-preview:free",
			label: "Gemini 2.5 Flash Image Preview (Free)",
			supportsEditMode: true,
			outputFormats: ["png", "jpeg"],
		},
	],
	// Models reached through the Gemini API directly use bare ids.
	gemini: [
		{
			provider: "gemini",
			modelId: "gemini-2.5-flash-image-preview",
			label: "Gemini 2.5 Flash Image Preview",
			supportsEditMode: true,
			outputFormats: ["png", "jpeg"],
		},
	],
}
52+
53+
/**
 * Look up the models registered for one provider.
 *
 * @param provider Provider whose models should be returned
 * @returns The provider's entry in IMAGE_GENERATION_MODELS, or an empty array if it has none
 */
export function getImageGenerationModelsForProvider(provider: ImageGenerationProvider): ImageGenerationModelInfo[] {
	const models = IMAGE_GENERATION_MODELS[provider]
	return models ? models : []
}
59+
60+
/**
 * Collect the models of every provider into one flat list.
 *
 * @returns A new array holding all entries of IMAGE_GENERATION_MODELS, provider by provider
 */
export function getAllImageGenerationModels(): ImageGenerationModelInfo[] {
	const allModels: ImageGenerationModelInfo[] = []
	for (const providerModels of Object.values(IMAGE_GENERATION_MODELS)) {
		allModels.push(...providerModels)
	}
	return allModels
}
66+
67+
/**
 * Outcome of an image generation request.
 */
export interface ImageGenerationResult {
	/** Whether an image was generated. */
	success: boolean
	/** Base64 encoded image data URL. */
	imageData?: string
	/** Format of the generated image (png, jpeg, etc.). */
	imageFormat?: string
	/** Description of what went wrong. */
	error?: string
}

packages/types/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ export * from "./experiment.js"
77
export * from "./followup.js"
88
export * from "./global-settings.js"
99
export * from "./history.js"
10+
export * from "./image-generation.js"
1011
export * from "./ipc.js"
1112
export * from "./marketplace.js"
1213
export * from "./mcp.js"

src/api/providers/gemini.ts

Lines changed: 142 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,13 @@ import {
88
} from "@google/genai"
99
import type { JWTInput } from "google-auth-library"
1010

11-
import { type ModelInfo, type GeminiModelId, geminiDefaultModelId, geminiModels } from "@roo-code/types"
11+
import {
12+
type ModelInfo,
13+
type GeminiModelId,
14+
geminiDefaultModelId,
15+
geminiModels,
16+
type ImageGenerationResult,
17+
} from "@roo-code/types"
1218

1319
import type { ApiHandlerOptions } from "../../shared/api"
1420
import { safeJsonParse } from "../../shared/safeJsonParse"
@@ -335,4 +341,139 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
335341

336342
return totalCost
337343
}
344+
345+
/**
 * Generate an image using Gemini's image generation API, either from text
 * alone or by editing a supplied input image.
 *
 * Failures (invalid input image, empty response, API errors) are caught and
 * reported through the returned result with `success: false`.
 *
 * @param prompt The text prompt for image generation
 * @param model The model to use for generation
 * @param apiKey The Gemini API key (if not using vertex)
 * @param inputImage Optional base64 encoded input image data URL (PNG or JPEG) for editing
 * @returns The generated image as a data URL plus its format, or an error
 */
async generateImage(
	prompt: string,
	model: string,
	apiKey?: string,
	inputImage?: string,
): Promise<ImageGenerationResult> {
	try {
		// A caller-supplied key only applies to standard Gemini. With a Vertex project
		// configured (or no key given) the handler's existing client is reused.
		const client = apiKey && !this.options.vertexProjectId ? new GoogleGenAI({ apiKey }) : this.client

		// Prepare the content for generation
		let contents: GenerateContentParameters["contents"]

		if (inputImage) {
			// For image editing mode, include both text and image
			const base64Match = inputImage.match(/^data:image\/(png|jpeg|jpg);base64,(.+)$/)
			if (!base64Match) {
				return {
					success: false,
					error: "Invalid input image format. Expected base64 data URL.",
				}
			}

			// "jpg" is not a registered MIME subtype; normalize it to "jpeg".
			const mimeType = base64Match[1] === "jpg" ? "image/jpeg" : `image/${base64Match[1]}`
			const base64Data = base64Match[2]

			contents = [
				{
					role: "user",
					parts: [{ text: prompt }, { inlineData: { mimeType, data: base64Data } }],
				},
			]
		} else {
			// For text-to-image mode
			contents = [{ role: "user", parts: [{ text: prompt }] }]
		}

		const config: GenerateContentConfig = {
			httpOptions: this.options.googleGeminiBaseUrl
				? { baseUrl: this.options.googleGeminiBaseUrl }
				: undefined,
			temperature: 1.0, // Higher temperature for more creative image generation
		}

		const params: GenerateContentParameters = {
			model,
			contents,
			config,
		}

		const result = await client.models.generateContent(params)

		// Only the first candidate is inspected.
		const candidate = result.candidates?.[0]
		if (!candidate) {
			return {
				success: false,
				error: "No candidates returned in the response",
			}
		}

		const responseParts = candidate.content?.parts
		if (!responseParts) {
			return {
				success: false,
				error: "No content parts in the response",
			}
		}

		// Return the first part that carries an actual image payload.
		for (const part of responseParts) {
			const mimeType = part.inlineData?.mimeType
			const data = part.inlineData?.data

			// Both fields are optional in the SDK types. An image part without data must
			// not be reported as success (it would yield a "...;base64,undefined" URL).
			if (mimeType?.startsWith("image/") && data) {
				return {
					success: true,
					// Convert to data URL format
					imageData: `data:${mimeType};base64,${data}`,
					// Extract format from mime type
					imageFormat: mimeType.replace("image/", "").replace("jpeg", "jpg"),
				}
			}
		}

		return {
			success: false,
			error: "No image data found in the response",
		}
	} catch (error) {
		const errorMessage = error instanceof Error ? error.message : "Unknown error occurred"
		console.error("Gemini image generation error:", errorMessage)

		return {
			success: false,
			error: `Failed to generate image: ${errorMessage}`,
		}
	}
}
338479
}

src/core/tools/generateImageTool.ts

Lines changed: 73 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,8 @@ import { getReadablePath } from "../../utils/path"
99
import { isPathOutsideWorkspace } from "../../utils/pathUtils"
1010
import { EXPERIMENT_IDS, experiments } from "../../shared/experiments"
1111
import { OpenRouterHandler } from "../../api/providers/openrouter"
12-
13-
// Hardcoded list of image generation models for now
14-
const IMAGE_GENERATION_MODELS = ["google/gemini-2.5-flash-image", "openai/gpt-5-image", "openai/gpt-5-image-mini"]
12+
import { GeminiHandler } from "../../api/providers/gemini"
13+
import { ImageGenerationProvider, getImageGenerationModelsForProvider } from "@roo-code/types"
1514

1615
export async function generateImageTool(
1716
cline: Task,
@@ -128,25 +127,60 @@ export async function generateImageTool(
128127
// Check if file is write-protected
129128
const isWriteProtected = cline.rooProtectedController?.isWriteProtected(relPath) || false
130129

131-
// Get OpenRouter API key from global settings (experimental image generation)
132-
const openRouterApiKey = state?.openRouterImageApiKey
130+
// Get the selected provider from settings (default to openrouter)
131+
const selectedProvider = (state?.imageGenerationProvider || "openrouter") as ImageGenerationProvider
133132

134-
if (!openRouterApiKey) {
135-
await cline.say(
136-
"error",
137-
"OpenRouter API key is required for image generation. Please configure it in the Image Generation experimental settings.",
138-
)
139-
pushToolResult(
140-
formatResponse.toolError(
133+
// Get selected model from settings based on provider
134+
let selectedModel: string
135+
let apiKey: string | undefined
136+
137+
if (selectedProvider === "openrouter") {
138+
apiKey = state?.openRouterImageApiKey
139+
if (!apiKey) {
140+
await cline.say(
141+
"error",
141142
"OpenRouter API key is required for image generation. Please configure it in the Image Generation experimental settings.",
142-
),
143-
)
143+
)
144+
pushToolResult(
145+
formatResponse.toolError(
146+
"OpenRouter API key is required for image generation. Please configure it in the Image Generation experimental settings.",
147+
),
148+
)
149+
return
150+
}
151+
// Get selected model or use default for OpenRouter
152+
const models = getImageGenerationModelsForProvider("openrouter")
153+
selectedModel =
154+
state?.openRouterImageGenerationSelectedModel ||
155+
(models[0]?.modelId ?? "google/gemini-2.5-flash-image-preview")
156+
} else if (selectedProvider === "gemini") {
157+
// For Gemini, we can use the existing Gemini API key from the provider settings
158+
// Check for a dedicated image generation API key first, then fall back to the provider's API key
159+
apiKey =
160+
state?.geminiImageApiKey ||
161+
(state?.apiConfiguration?.apiProvider === "gemini" ? state?.apiConfiguration?.geminiApiKey : undefined)
162+
if (!apiKey) {
163+
await cline.say(
164+
"error",
165+
"Gemini API key is required for image generation. Please configure it in the Image Generation experimental settings or in the Gemini provider settings.",
166+
)
167+
pushToolResult(
168+
formatResponse.toolError(
169+
"Gemini API key is required for image generation. Please configure it in the Image Generation experimental settings or in the Gemini provider settings.",
170+
),
171+
)
172+
return
173+
}
174+
// Get selected model or use default for Gemini
175+
const models = getImageGenerationModelsForProvider("gemini")
176+
selectedModel =
177+
state?.geminiImageGenerationSelectedModel || (models[0]?.modelId ?? "gemini-2.5-flash-image-preview")
178+
} else {
179+
await cline.say("error", `Unsupported image generation provider: ${selectedProvider}`)
180+
pushToolResult(formatResponse.toolError(`Unsupported image generation provider: ${selectedProvider}`))
144181
return
145182
}
146183

147-
// Get selected model from settings or use default
148-
const selectedModel = state?.openRouterImageGenerationSelectedModel || IMAGE_GENERATION_MODELS[0]
149-
150184
// Determine if the path is outside the workspace
151185
const fullPath = path.resolve(cline.cwd, removeClosingTag("path", relPath))
152186
const isOutsideWorkspace = isPathOutsideWorkspace(fullPath)
@@ -176,16 +210,28 @@ export async function generateImageTool(
176210
return
177211
}
178212

179-
// Create a temporary OpenRouter handler with minimal options
180-
const openRouterHandler = new OpenRouterHandler({} as any)
181-
182-
// Call the generateImage method with the explicit API key and optional input image
183-
const result = await openRouterHandler.generateImage(
184-
prompt,
185-
selectedModel,
186-
openRouterApiKey,
187-
inputImageData,
188-
)
213+
// Generate image based on provider
214+
let result
215+
216+
if (selectedProvider === "openrouter") {
217+
// Create a temporary OpenRouter handler with minimal options
218+
const openRouterHandler = new OpenRouterHandler({} as any)
219+
220+
// Call the generateImage method with the explicit API key and optional input image
221+
result = await openRouterHandler.generateImage(prompt, selectedModel, apiKey!, inputImageData)
222+
} else if (selectedProvider === "gemini") {
223+
// Create a temporary Gemini handler with minimal options
224+
const geminiHandler = new GeminiHandler({ geminiApiKey: apiKey } as any)
225+
226+
// Call the generateImage method with the optional input image
227+
result = await geminiHandler.generateImage(prompt, selectedModel, apiKey, inputImageData)
228+
} else {
229+
// This should not happen due to earlier check, but for type safety
230+
result = {
231+
success: false,
232+
error: `Unsupported provider: ${selectedProvider}`,
233+
}
234+
}
189235

190236
if (!result.success) {
191237
await cline.say("error", result.error || "Failed to generate image")

0 commit comments

Comments
 (0)