Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion packages/types/src/global-settings.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,11 @@ export const globalSettingsSchema = z.object({
dismissedUpsells: z.array(z.string()).optional(),

// Image generation settings (experimental) - flattened for simplicity
imageGenerationProvider: z.enum(["openrouter", "gemini"]).optional(),
openRouterImageApiKey: z.string().optional(),
openRouterImageGenerationSelectedModel: z.string().optional(),
geminiImageApiKey: z.string().optional(),
geminiImageGenerationSelectedModel: z.string().optional(),

condensingApiConfigId: z.string().optional(),
customCondensingPrompt: z.string().optional(),
Expand Down Expand Up @@ -210,7 +213,8 @@ export const SECRET_STATE_KEYS = [

// Global secrets that are part of GlobalSettings (not ProviderSettings).
// Each key appears exactly once so that consumers iterating this list do not
// process the same secret twice.
export const GLOBAL_SECRET_KEYS = [
	"openRouterImageApiKey", // For OpenRouter image generation
	"geminiImageApiKey", // For Gemini image generation
] as const

// Type for the actual secret storage keys
Expand Down
75 changes: 75 additions & 0 deletions packages/types/src/image-generation.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { z } from "zod"

/**
 * Image Generation Provider
 *
 * `imageGenerationProviders` is the literal list of supported provider IDs.
 * The zod enum schema is built from that list, and the `ImageGenerationProvider`
 * union type is inferred from the schema, so all three stay in sync.
 */
export const imageGenerationProviders = ["openrouter", "gemini"] as const
export const imageGenerationProviderSchema = z.enum(imageGenerationProviders)
export type ImageGenerationProvider = z.infer<typeof imageGenerationProviderSchema>

/**
 * Image Generation Model Info
 *
 * Describes one selectable image generation model. Only `provider`, `modelId`
 * and `label` are required; the remaining fields are optional capability hints.
 */
export interface ImageGenerationModelInfo {
provider: ImageGenerationProvider
modelId: string
label: string
supportsEditMode?: boolean // Whether the model supports image editing (text + image input)
maxInputSize?: number // Maximum input image size in MB
outputFormats?: string[] // Supported output formats, e.g. "png", "jpeg"
}

/**
 * Builds a catalogue entry for a Gemini 2.5 Flash Image Preview variant.
 * Every call returns a fresh object with its own `outputFormats` array, so
 * entries never share mutable state.
 */
function flashImagePreviewModel(
	provider: ImageGenerationProvider,
	modelId: string,
	label: string,
): ImageGenerationModelInfo {
	return {
		provider,
		modelId,
		label,
		supportsEditMode: true,
		outputFormats: ["png", "jpeg"],
	}
}

/**
 * Image Generation Models by Provider
 *
 * Maps each provider ID to the models it offers. The first entry of each list
 * is the one callers get when they index `[0]`, so ordering is significant.
 */
export const IMAGE_GENERATION_MODELS: Record<ImageGenerationProvider, ImageGenerationModelInfo[]> = {
	openrouter: [
		flashImagePreviewModel("openrouter", "google/gemini-2.5-flash-image-preview", "Gemini 2.5 Flash Image Preview"),
		flashImagePreviewModel(
			"openrouter",
			"google/gemini-2.5-flash-image-preview:free",
			"Gemini 2.5 Flash Image Preview (Free)",
		),
	],
	gemini: [flashImagePreviewModel("gemini", "gemini-2.5-flash-image-preview", "Gemini 2.5 Flash Image Preview")],
}

/**
 * Look up the image generation models registered for a single provider.
 *
 * @param provider Provider ID used as the key into `IMAGE_GENERATION_MODELS`
 * @returns The provider's model list, or an empty array when the lookup yields nothing
 */
export function getImageGenerationModelsForProvider(provider: ImageGenerationProvider): ImageGenerationModelInfo[] {
	const registered = IMAGE_GENERATION_MODELS[provider]
	return registered ? registered : []
}

/**
 * Collect the models of every provider into one flat list.
 *
 * @returns A new array containing each provider's models, in the key order of
 *   `IMAGE_GENERATION_MODELS` and preserving each provider's own ordering
 */
export function getAllImageGenerationModels(): ImageGenerationModelInfo[] {
	const everyModel: ImageGenerationModelInfo[] = []
	for (const providerModels of Object.values(IMAGE_GENERATION_MODELS)) {
		for (const model of providerModels) {
			everyModel.push(model)
		}
	}
	return everyModel
}

/**
 * Image Generation Result
 *
 * Outcome of one image generation call. `success` is always present; the other
 * fields are optional and carry either the generated image or a failure message.
 */
export interface ImageGenerationResult {
success: boolean
imageData?: string // Base64 encoded image data URL
imageFormat?: string // Format of the generated image (png, jpeg, etc.)
error?: string // Description of what went wrong when generation failed
}
1 change: 1 addition & 0 deletions packages/types/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ export * from "./experiment.js"
export * from "./followup.js"
export * from "./global-settings.js"
export * from "./history.js"
export * from "./image-generation.js"
export * from "./ipc.js"
export * from "./marketplace.js"
export * from "./mcp.js"
Expand Down
144 changes: 143 additions & 1 deletion src/api/providers/gemini.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,17 @@ import {
type GenerateContentParameters,
type GenerateContentConfig,
type GroundingMetadata,
type Content,
} from "@google/genai"
import type { JWTInput } from "google-auth-library"

import { type ModelInfo, type GeminiModelId, geminiDefaultModelId, geminiModels } from "@roo-code/types"
import {
type ModelInfo,
type GeminiModelId,
geminiDefaultModelId,
geminiModels,
type ImageGenerationResult,
} from "@roo-code/types"

import type { ApiHandlerOptions } from "../../shared/api"
import { safeJsonParse } from "../../shared/safeJsonParse"
Expand Down Expand Up @@ -335,4 +342,139 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl

return totalCost
}

/**
 * Generate an image using Gemini's image generation API, optionally editing a supplied image.
 *
 * This method does not throw: every failure (malformed input image, empty response,
 * exception raised by the SDK call) is reported as `{ success: false, error }`.
 *
 * @param prompt The text prompt for image generation
 * @param model The model to use for generation
 * @param apiKey The Gemini API key. When provided and no `vertexProjectId` option is set,
 *   a one-off client is created with it; otherwise the handler's existing client is used.
 * @param inputImage Optional base64 data URL (`data:image/png|jpeg|jpg;base64,...`) for editing;
 *   when present it is sent in the same user turn as the prompt
 * @returns On success, the first usable image part of the first candidate as a base64 data URL
 *   together with its format; otherwise an error description
 */
async generateImage(
	prompt: string,
	model: string,
	apiKey?: string,
	inputImage?: string,
): Promise<ImageGenerationResult> {
	try {
		// An explicit key only applies to standard Gemini; with Vertex configured (or no key
		// given) fall back to the client this handler already owns.
		const client: GoogleGenAI =
			apiKey && !this.options.vertexProjectId ? new GoogleGenAI({ apiKey }) : this.client

		// The prompt is always the first part; edit mode appends the source image after it.
		const parts: NonNullable<Content["parts"]> = [{ text: prompt }]

		if (inputImage) {
			const base64Match = inputImage.match(/^data:image\/(png|jpeg|jpg);base64,(.+)$/)
			if (!base64Match) {
				return {
					success: false,
					error: "Invalid input image format. Expected base64 data URL.",
				}
			}

			// "jpg" is not a registered MIME subtype, so normalise it to "jpeg".
			const mimeType = base64Match[1] === "jpg" ? "image/jpeg" : `image/${base64Match[1]}`
			const base64Data = base64Match[2]

			parts.push({
				inlineData: {
					mimeType,
					data: base64Data,
				},
			})
		}

		const contents: Content[] = [{ role: "user", parts }]

		const config: GenerateContentConfig = {
			httpOptions: this.options.googleGeminiBaseUrl
				? { baseUrl: this.options.googleGeminiBaseUrl }
				: undefined,
			temperature: 1.0, // Higher temperature for more creative image generation
		}

		const params: GenerateContentParameters = {
			model,
			contents,
			config,
		}

		const result = await client.models.generateContent(params)

		// Extract the generated image from the response
		if (!result.candidates || result.candidates.length === 0) {
			return {
				success: false,
				error: "No candidates returned in the response",
			}
		}

		const candidate = result.candidates[0]
		if (!candidate.content || !candidate.content.parts) {
			return {
				success: false,
				error: "No content parts in the response",
			}
		}

		// Return the first part that is an image AND actually carries a payload.
		for (const part of candidate.content.parts) {
			const mimeType = part.inlineData?.mimeType
			const data = part.inlineData?.data

			// Skip text parts, non-image blobs, and image parts whose payload is missing.
			// Without the payload check the result would be "data:<mime>;base64,undefined".
			if (!mimeType || !data || !mimeType.startsWith("image/")) {
				continue
			}

			return {
				success: true,
				// Convert to data URL format
				imageData: `data:${mimeType};base64,${data}`,
				// Derive the format from the MIME subtype, reporting "jpeg" as "jpg"
				imageFormat: mimeType.replace("image/", "").replace("jpeg", "jpg"),
			}
		}

		return {
			success: false,
			error: "No image data found in the response",
		}
	} catch (error) {
		const errorMessage = error instanceof Error ? error.message : "Unknown error occurred"
		console.error("Gemini image generation error:", errorMessage)

		return {
			success: false,
			error: `Failed to generate image: ${errorMessage}`,
		}
	}
}
}
100 changes: 73 additions & 27 deletions src/core/tools/generateImageTool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ import { getReadablePath } from "../../utils/path"
import { isPathOutsideWorkspace } from "../../utils/pathUtils"
import { EXPERIMENT_IDS, experiments } from "../../shared/experiments"
import { OpenRouterHandler } from "../../api/providers/openrouter"

// Hardcoded list of image generation models for now
const IMAGE_GENERATION_MODELS = ["google/gemini-2.5-flash-image", "openai/gpt-5-image", "openai/gpt-5-image-mini"]
import { GeminiHandler } from "../../api/providers/gemini"
import { ImageGenerationProvider, getImageGenerationModelsForProvider } from "@roo-code/types"

export async function generateImageTool(
cline: Task,
Expand Down Expand Up @@ -128,25 +127,60 @@ export async function generateImageTool(
// Check if file is write-protected
const isWriteProtected = cline.rooProtectedController?.isWriteProtected(relPath) || false

// Get OpenRouter API key from global settings (experimental image generation)
const openRouterApiKey = state?.openRouterImageApiKey
// Get the selected provider from settings (default to openrouter)
const selectedProvider = (state?.imageGenerationProvider || "openrouter") as ImageGenerationProvider

if (!openRouterApiKey) {
await cline.say(
"error",
"OpenRouter API key is required for image generation. Please configure it in the Image Generation experimental settings.",
)
pushToolResult(
formatResponse.toolError(
// Get selected model from settings based on provider
let selectedModel: string
let apiKey: string | undefined

if (selectedProvider === "openrouter") {
apiKey = state?.openRouterImageApiKey
if (!apiKey) {
await cline.say(
"error",
"OpenRouter API key is required for image generation. Please configure it in the Image Generation experimental settings.",
),
)
)
pushToolResult(
formatResponse.toolError(
"OpenRouter API key is required for image generation. Please configure it in the Image Generation experimental settings.",
),
)
return
}
// Get selected model or use default for OpenRouter
const models = getImageGenerationModelsForProvider("openrouter")
selectedModel =
state?.openRouterImageGenerationSelectedModel ||
(models[0]?.modelId ?? "google/gemini-2.5-flash-image-preview")
} else if (selectedProvider === "gemini") {
// For Gemini, we can use the existing Gemini API key from the provider settings
// Check for a dedicated image generation API key first, then fall back to the provider's API key
apiKey =
state?.geminiImageApiKey ||
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fallback to the main Gemini API key is a nice touch for user convenience. However, could we add a comment here explaining this fallback behavior so future maintainers understand the logic?

(state?.apiConfiguration?.apiProvider === "gemini" ? state?.apiConfiguration?.geminiApiKey : undefined)
if (!apiKey) {
await cline.say(
"error",
"Gemini API key is required for image generation. Please configure it in the Image Generation experimental settings or in the Gemini provider settings.",
)
pushToolResult(
formatResponse.toolError(
"Gemini API key is required for image generation. Please configure it in the Image Generation experimental settings or in the Gemini provider settings.",
),
)
return
}
// Get selected model or use default for Gemini
const models = getImageGenerationModelsForProvider("gemini")
selectedModel =
state?.geminiImageGenerationSelectedModel || (models[0]?.modelId ?? "gemini-2.5-flash-image-preview")
} else {
await cline.say("error", `Unsupported image generation provider: ${selectedProvider}`)
pushToolResult(formatResponse.toolError(`Unsupported image generation provider: ${selectedProvider}`))
return
}

// Get selected model from settings or use default
const selectedModel = state?.openRouterImageGenerationSelectedModel || IMAGE_GENERATION_MODELS[0]

// Determine if the path is outside the workspace
const fullPath = path.resolve(cline.cwd, removeClosingTag("path", relPath))
const isOutsideWorkspace = isPathOutsideWorkspace(fullPath)
Expand Down Expand Up @@ -176,16 +210,28 @@ export async function generateImageTool(
return
}

// Create a temporary OpenRouter handler with minimal options
const openRouterHandler = new OpenRouterHandler({} as any)

// Call the generateImage method with the explicit API key and optional input image
const result = await openRouterHandler.generateImage(
prompt,
selectedModel,
openRouterApiKey,
inputImageData,
)
// Generate image based on provider
let result

if (selectedProvider === "openrouter") {
// Create a temporary OpenRouter handler with minimal options
const openRouterHandler = new OpenRouterHandler({} as any)

// Call the generateImage method with the explicit API key and optional input image
result = await openRouterHandler.generateImage(prompt, selectedModel, apiKey!, inputImageData)
} else if (selectedProvider === "gemini") {
// Create a temporary Gemini handler with minimal options
const geminiHandler = new GeminiHandler({ geminiApiKey: apiKey } as any)

// Call the generateImage method with the optional input image
result = await geminiHandler.generateImage(prompt, selectedModel, apiKey, inputImageData)
} else {
// This should not happen due to earlier check, but for type safety
result = {
success: false,
error: `Unsupported provider: ${selectedProvider}`,
}
}

if (!result.success) {
await cline.say("error", result.error || "Failed to generate image")
Expand Down
Loading
Loading