
Commit 0dfbae6

Allow users to toggle Gemini caching on / off for OpenRouter (RooCodeInc#2927)
1 parent 5c2511e commit 0dfbae6

File tree

9 files changed: +140, -72 lines

.changeset/thin-tigers-yawn.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"roo-cline": patch
+---
+
+Allow users to turn prompt caching on / off for Gemini 2.5 on OpenRouter

src/api/providers/__tests__/gemini.test.ts

Lines changed: 1 addition & 8 deletions
@@ -74,14 +74,7 @@ describe("GeminiHandler", () => {
 			expect(chunks.length).toBe(3)
 			expect(chunks[0]).toEqual({ type: "text", text: "Hello" })
 			expect(chunks[1]).toEqual({ type: "text", text: " world!" })
-			expect(chunks[2]).toEqual({
-				type: "usage",
-				inputTokens: 10,
-				outputTokens: 5,
-				cacheReadTokens: undefined,
-				cacheWriteTokens: undefined,
-				thinkingTokens: undefined,
-			})
+			expect(chunks[2]).toEqual({ type: "usage", inputTokens: 10, outputTokens: 5 })

 			// Verify the call to generateContentStream
 			expect(handler["client"].models.generateContentStream).toHaveBeenCalledWith(

src/api/providers/__tests__/openrouter.test.ts

Lines changed: 6 additions & 2 deletions
@@ -54,10 +54,14 @@ describe("OpenRouterHandler", () => {
 				id: mockOptions.openRouterModelId,
 				info: mockOptions.openRouterModelInfo,
 				maxTokens: 1000,
-				reasoning: undefined,
-				temperature: 0,
 				thinking: undefined,
+				temperature: 0,
+				reasoningEffort: undefined,
 				topP: undefined,
+				promptCache: {
+					supported: false,
+					optional: false,
+				},
 			})
 		})

src/api/providers/fetchers/__tests__/fixtures/openrouter-models.json

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

src/api/providers/fetchers/__tests__/openrouter.test.ts

Lines changed: 26 additions & 1 deletion
@@ -9,7 +9,7 @@ import { PROMPT_CACHING_MODELS } from "../../../../shared/api"
 import { getOpenRouterModels } from "../openrouter"

 nockBack.fixtures = path.join(__dirname, "fixtures")
-nockBack.setMode("dryrun")
+nockBack.setMode("lockdown")

 describe("OpenRouter API", () => {
 	describe("getOpenRouterModels", () => {
@@ -66,6 +66,31 @@ describe("OpenRouter API", () => {
 				supportsComputerUse: true,
 			})

+			expect(
+				Object.entries(models)
+					.filter(([id, _]) => id.startsWith("anthropic/claude-3"))
+					.map(([id, model]) => ({ id, maxTokens: model.maxTokens }))
+					.sort(({ id: a }, { id: b }) => a.localeCompare(b)),
+			).toEqual([
+				{ id: "anthropic/claude-3-haiku", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-haiku:beta", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-opus", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-opus:beta", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-sonnet", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-sonnet:beta", maxTokens: 4096 },
+				{ id: "anthropic/claude-3.5-haiku", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-haiku-20241022", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-haiku-20241022:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-haiku:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet-20240620", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet-20240620:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.7-sonnet", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.7-sonnet:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.7-sonnet:thinking", maxTokens: 128000 },
+			])
+
 			nockDone()
 		})
 	})

src/api/providers/fetchers/openrouter.ts

Lines changed: 28 additions & 26 deletions
@@ -1,7 +1,13 @@
 import axios from "axios"
 import { z } from "zod"

-import { ApiHandlerOptions, ModelInfo } from "../../../shared/api"
+import {
+	ApiHandlerOptions,
+	ModelInfo,
+	anthropicModels,
+	COMPUTER_USE_MODELS,
+	OPTIONAL_PROMPT_CACHING_MODELS,
+} from "../../../shared/api"
 import { parseApiPrice } from "../../../utils/cost"

 // https://openrouter.ai/api/v1/models
@@ -62,8 +68,8 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
 			? parseApiPrice(rawModel.pricing?.input_cache_read)
 			: undefined

-		// Disable prompt caching for Gemini models for now.
-		const supportsPromptCache = !!cacheWritesPrice && !!cacheReadsPrice && !rawModel.id.startsWith("google")
+		const supportsPromptCache =
+			typeof cacheWritesPrice !== "undefined" && typeof cacheReadsPrice !== "undefined"

 		const modelInfo: ModelInfo = {
 			maxTokens: rawModel.top_provider?.max_completion_tokens,
@@ -78,29 +84,25 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
 			thinking: rawModel.id === "anthropic/claude-3.7-sonnet:thinking",
 		}

-		// Then OpenRouter model definition doesn't give us any hints about computer use,
-		// so we need to set that manually.
-		// The ideal `maxTokens` values are model dependent, but we should probably DRY
-		// this up and use the values defined for the Anthropic providers.
-		switch (true) {
-			case rawModel.id.startsWith("anthropic/claude-3.7-sonnet"):
-				modelInfo.supportsComputerUse = true
-				modelInfo.maxTokens = rawModel.id === "anthropic/claude-3.7-sonnet:thinking" ? 128_000 : 8192
-				break
-			case rawModel.id.startsWith("anthropic/claude-3.5-sonnet-20240620"):
-				modelInfo.maxTokens = 8192
-				break
-			case rawModel.id.startsWith("anthropic/claude-3.5-sonnet"):
-				modelInfo.supportsComputerUse = true
-				modelInfo.maxTokens = 8192
-				break
-			case rawModel.id.startsWith("anthropic/claude-3-5-haiku"):
-			case rawModel.id.startsWith("anthropic/claude-3-opus"):
-			case rawModel.id.startsWith("anthropic/claude-3-haiku"):
-				modelInfo.maxTokens = 8192
-				break
-			default:
-				break
+		// The OpenRouter model definition doesn't give us any hints about
+		// computer use, so we need to set that manually.
+		if (COMPUTER_USE_MODELS.has(rawModel.id)) {
+			modelInfo.supportsComputerUse = true
+		}
+
+		// We want to treat prompt caching as "experimental" for these models.
+		if (OPTIONAL_PROMPT_CACHING_MODELS.has(rawModel.id)) {
+			modelInfo.isPromptCacheOptional = true
+		}
+
+		// Claude 3.7 Sonnet is a "hybrid" thinking model, and the `maxTokens`
+		// values can be configured. For the non-thinking variant we want to
+		// use 8k. The `thinking` variant can be run in 64k and 128k modes,
+		// and we want to use 128k.
+		if (rawModel.id.startsWith("anthropic/claude-3.7-sonnet")) {
+			modelInfo.maxTokens = rawModel.id.includes("thinking")
+				? anthropicModels["claude-3-7-sonnet-20250219:thinking"].maxTokens
+				: anthropicModels["claude-3-7-sonnet-20250219"].maxTokens
 		}

 		models[rawModel.id] = modelInfo
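For illustration, here is a minimal standalone sketch of how the new mapping classifies a model. derivePromptCacheFlags is a hypothetical helper, and the prices are made-up numbers rather than values from the OpenRouter API or the test fixture: caching counts as supported whenever both cache prices are defined, and membership in OPTIONAL_PROMPT_CACHING_MODELS additionally marks it as opt-in.

// Sketch only: mirrors the mapping above with hypothetical inputs.
import { OPTIONAL_PROMPT_CACHING_MODELS } from "../../../shared/api"

function derivePromptCacheFlags(id: string, cacheWritesPrice?: number, cacheReadsPrice?: number) {
	return {
		// Supported whenever both cache prices are present in the pricing data.
		supportsPromptCache:
			typeof cacheWritesPrice !== "undefined" && typeof cacheReadsPrice !== "undefined",
		// Opt-in ("experimental") for the models listed in OPTIONAL_PROMPT_CACHING_MODELS.
		isPromptCacheOptional: OPTIONAL_PROMPT_CACHING_MODELS.has(id),
	}
}

// Hypothetical prices (USD per 1M tokens):
derivePromptCacheFlags("google/gemini-2.5-pro-preview-03-25", 0.625, 0.31)
// -> { supportsPromptCache: true, isPromptCacheOptional: true }

derivePromptCacheFlags("anthropic/claude-3.5-sonnet", 3.75, 0.3)
// -> { supportsPromptCache: true, isPromptCacheOptional: false }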

src/api/providers/gemini.ts

Lines changed: 37 additions & 27 deletions
@@ -54,7 +54,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl

 		let uncachedContent: Content[] | undefined = undefined
 		let cachedContent: string | undefined = undefined
-		let cacheWriteTokens: number | undefined = undefined

 		// The minimum input token count for context caching is 4,096.
 		// For a basic approximation we assume 4 characters per token.
@@ -67,6 +66,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 			cacheKey &&
 			contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM

+		let cacheWrite = false
+
 		if (isCacheAvailable) {
 			const cacheEntry = this.contentCaches.get<CacheEntry>(cacheKey)

@@ -97,9 +98,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl

 					if (name) {
 						this.contentCaches.set<CacheEntry>(cacheKey, { key: name, count: contents.length })
-						cacheWriteTokens = usageMetadata?.totalTokenCount ?? 0
 						console.log(
-							`[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`,
+							`[GeminiHandler] cached ${contents.length} messages (${usageMetadata?.totalTokenCount ?? "-"} tokens) in ${Date.now() - timestamp}ms`,
 						)
 					}
 				})
@@ -109,6 +109,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 					.finally(() => {
 						this.isCacheBusy = false
 					})
+
+				cacheWrite = true
 			}
 		}

@@ -146,27 +148,24 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		if (lastUsageMetadata) {
 			const inputTokens = lastUsageMetadata.promptTokenCount ?? 0
 			const outputTokens = lastUsageMetadata.candidatesTokenCount ?? 0
+			const cacheWriteTokens = cacheWrite ? inputTokens : undefined
 			const cacheReadTokens = lastUsageMetadata.cachedContentTokenCount
 			const reasoningTokens = lastUsageMetadata.thoughtsTokenCount

-			const totalCost = isCacheUsed
-				? this.calculateCost({
-						info,
-						inputTokens,
-						outputTokens,
-						cacheWriteTokens,
-						cacheReadTokens,
-					})
-				: undefined
-
 			yield {
 				type: "usage",
 				inputTokens,
 				outputTokens,
 				cacheWriteTokens,
 				cacheReadTokens,
 				reasoningTokens,
-				totalCost,
+				totalCost: this.calculateCost({
+					info,
+					inputTokens,
+					outputTokens,
+					cacheWriteTokens,
+					cacheReadTokens,
+				}),
 			}
 		}
 	}
@@ -250,8 +249,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		info,
 		inputTokens,
 		outputTokens,
-		cacheWriteTokens,
-		cacheReadTokens,
+		cacheWriteTokens = 0,
+		cacheReadTokens = 0,
 	}: {
 		info: ModelInfo
 		inputTokens: number
@@ -281,21 +280,32 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 			}
 		}

-		let inputTokensCost = inputPrice * (inputTokens / 1_000_000)
-		let outputTokensCost = outputPrice * (outputTokens / 1_000_000)
-		let cacheWriteCost = 0
-		let cacheReadCost = 0
+		// Subtract the cached input tokens from the total input tokens.
+		const uncachedInputTokens = inputTokens - cacheReadTokens

-		if (cacheWriteTokens) {
-			cacheWriteCost = cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60)
+		let cacheWriteCost =
+			cacheWriteTokens > 0 ? cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) : 0
+		let cacheReadCost = cacheReadTokens > 0 ? cacheReadsPrice * (cacheReadTokens / 1_000_000) : 0
+
+		const inputTokensCost = inputPrice * (uncachedInputTokens / 1_000_000)
+		const outputTokensCost = outputPrice * (outputTokens / 1_000_000)
+		const totalCost = inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
+
+		const trace: Record<string, { price: number; tokens: number; cost: number }> = {
+			input: { price: inputPrice, tokens: uncachedInputTokens, cost: inputTokensCost },
+			output: { price: outputPrice, tokens: outputTokens, cost: outputTokensCost },
 		}

-		if (cacheReadTokens) {
-			const uncachedReadTokens = inputTokens - cacheReadTokens
-			cacheReadCost = cacheReadsPrice * (cacheReadTokens / 1_000_000)
-			inputTokensCost = inputPrice * (uncachedReadTokens / 1_000_000)
+		if (cacheWriteTokens > 0) {
+			trace.cacheWrite = { price: cacheWritesPrice, tokens: cacheWriteTokens, cost: cacheWriteCost }
 		}

-		return inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
+		if (cacheReadTokens > 0) {
+			trace.cacheRead = { price: cacheReadsPrice, tokens: cacheReadTokens, cost: cacheReadCost }
+		}
+
+		// console.log(`[GeminiHandler] calculateCost -> ${totalCost}`, trace)
+
+		return totalCost
 	}
 }
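To make the revised cost math concrete, here is a standalone sketch of the same formula with made-up prices (the real values come from the model's pricing fields, and CACHE_TTL is the handler's own constant; the 5-minute value below is only an assumption). Tokens served from the cache are billed at the cache-read rate rather than the input rate, and a cache write is billed per token scaled by CACHE_TTL / 60, i.e. the cache lifetime expressed in hours if the TTL is in minutes.

// Standalone sketch of the cost formula above; prices and TTL are assumed values.
const CACHE_TTL = 5 // minutes (assumption for this sketch)

function calculateCost({
	inputPrice, // USD per 1M tokens
	outputPrice,
	cacheWritesPrice,
	cacheReadsPrice,
	inputTokens,
	outputTokens,
	cacheWriteTokens = 0,
	cacheReadTokens = 0,
}: {
	inputPrice: number
	outputPrice: number
	cacheWritesPrice: number
	cacheReadsPrice: number
	inputTokens: number
	outputTokens: number
	cacheWriteTokens?: number
	cacheReadTokens?: number
}): number {
	// Cached input tokens are billed at the cache-read rate, not the input rate.
	const uncachedInputTokens = inputTokens - cacheReadTokens

	const inputCost = inputPrice * (uncachedInputTokens / 1_000_000)
	const outputCost = outputPrice * (outputTokens / 1_000_000)
	const cacheWriteCost =
		cacheWriteTokens > 0 ? cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) : 0
	const cacheReadCost = cacheReadTokens > 0 ? cacheReadsPrice * (cacheReadTokens / 1_000_000) : 0

	return inputCost + outputCost + cacheWriteCost + cacheReadCost
}

// Example: 100k-token prompt, 80k of it read from cache, 2k output, no cache write.
// 1.25 * 0.02 + 10 * 0.002 + 0 + 0.31 * 0.08 = 0.0698 (assumed prices)
calculateCost({
	inputPrice: 1.25,
	outputPrice: 10,
	cacheWritesPrice: 1.625,
	cacheReadsPrice: 0.31,
	inputTokens: 100_000,
	outputTokens: 2_000,
	cacheReadTokens: 80_000,
})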

src/api/providers/openrouter.ts

Lines changed: 10 additions & 2 deletions
@@ -7,6 +7,7 @@ import {
 	openRouterDefaultModelId,
 	openRouterDefaultModelInfo,
 	PROMPT_CACHING_MODELS,
+	OPTIONAL_PROMPT_CACHING_MODELS,
 } from "../../shared/api"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { ApiStreamChunk } from "../transform/stream"
@@ -65,7 +66,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 		systemPrompt: string,
 		messages: Anthropic.Messages.MessageParam[],
 	): AsyncGenerator<ApiStreamChunk> {
-		let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort, info } = this.getModel()
+		let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort, promptCache } = this.getModel()

 		// Convert Anthropic messages to OpenAI format.
 		let openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
@@ -78,11 +79,13 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 			openAiMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
 		}

+		const isCacheAvailable = promptCache.supported && (!promptCache.optional || this.options.promptCachingEnabled)
+
 		// Prompt caching: https://openrouter.ai/docs/prompt-caching
 		// Now with Gemini support: https://openrouter.ai/docs/features/prompt-caching
 		// Note that we don't check the `ModelInfo` object because it is cached
 		// in the settings for OpenRouter and the value could be stale.
-		if (PROMPT_CACHING_MODELS.has(modelId)) {
+		if (isCacheAvailable) {
 			openAiMessages[0] = {
 				role: "system",
 				// @ts-ignore-next-line
@@ -193,8 +196,13 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 		return {
 			id,
 			info,
+			// maxTokens, thinking, temperature, reasoningEffort
 			...getModelParams({ options: this.options, model: info, defaultTemperature }),
 			topP,
+			promptCache: {
+				supported: PROMPT_CACHING_MODELS.has(id),
+				optional: OPTIONAL_PROMPT_CACHING_MODELS.has(id),
+			},
 		}
 	}
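The gate reads like a small truth table. Here is a minimal standalone restatement of the same predicate (isCacheAvailable is written as a free function only for illustration, with promptCachingEnabled standing in for this.options.promptCachingEnabled):

// Standalone restatement of the gate above.
type PromptCache = { supported: boolean; optional: boolean }

function isCacheAvailable(promptCache: PromptCache, promptCachingEnabled?: boolean): boolean {
	// Always-on models cache unconditionally; optional models only when the user opted in.
	return promptCache.supported && (!promptCache.optional || !!promptCachingEnabled)
}

isCacheAvailable({ supported: true, optional: false }) // true  — e.g. Claude 3.x, regardless of the setting
isCacheAvailable({ supported: true, optional: true }, true) // true  — e.g. Gemini 2.5 with the toggle on
isCacheAvailable({ supported: true, optional: true }, false) // false — Gemini 2.5 with the toggle off
isCacheAvailable({ supported: false, optional: false }) // false — model without cache pricing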

src/shared/api.ts

Lines changed: 24 additions & 3 deletions
@@ -1401,8 +1401,10 @@ export const vscodeLlmModels = {
 * Constants
 */

+// These models support reasoning efforts.
 export const REASONING_MODELS = new Set(["x-ai/grok-3-mini-beta", "grok-3-mini-beta", "grok-3-mini-fast-beta"])

+// These models support prompt caching.
 export const PROMPT_CACHING_MODELS = new Set([
 	"anthropic/claude-3-haiku",
 	"anthropic/claude-3-haiku:beta",
@@ -1421,7 +1423,26 @@ export const PROMPT_CACHING_MODELS = new Set([
 	"anthropic/claude-3.7-sonnet",
 	"anthropic/claude-3.7-sonnet:beta",
 	"anthropic/claude-3.7-sonnet:thinking",
-	// "google/gemini-2.0-flash-001",
-	// "google/gemini-flash-1.5",
-	// "google/gemini-flash-1.5-8b",
+	"google/gemini-2.5-pro-preview-03-25",
+	"google/gemini-2.0-flash-001",
+	"google/gemini-flash-1.5",
+	"google/gemini-flash-1.5-8b",
+])
+
+// These models don't have prompt caching enabled by default (you can turn it on
+// in settings).
+export const OPTIONAL_PROMPT_CACHING_MODELS = new Set([
+	"google/gemini-2.5-pro-preview-03-25",
+	"google/gemini-2.0-flash-001",
+	"google/gemini-flash-1.5",
+	"google/gemini-flash-1.5-8b",
+])
+
+// https://www.anthropic.com/news/3-5-models-and-computer-use
+export const COMPUTER_USE_MODELS = new Set([
+	"anthropic/claude-3.5-sonnet",
+	"anthropic/claude-3.5-sonnet:beta",
+	"anthropic/claude-3.7-sonnet",
+	"anthropic/claude-3.7-sonnet:beta",
+	"anthropic/claude-3.7-sonnet:thinking",
 ])
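Since the handler treats optional caching as a refinement of supported caching, OPTIONAL_PROMPT_CACHING_MODELS should remain a subset of PROMPT_CACHING_MODELS (as it is in this commit). A possible invariant check, sketched here rather than taken from the repository's tests, assuming it lives next to src/shared/api.ts so the relative import resolves:

// Sketch of a subset check (not an existing test in this commit).
import { OPTIONAL_PROMPT_CACHING_MODELS, PROMPT_CACHING_MODELS } from "./api"

for (const id of OPTIONAL_PROMPT_CACHING_MODELS) {
	if (!PROMPT_CACHING_MODELS.has(id)) {
		throw new Error(`${id} is optional-cache but missing from PROMPT_CACHING_MODELS`)
	}
}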

0 commit comments
