
Commit 0dfbae6

Allow users to toggle Gemini caching on / off for OpenRouter (RooCodeInc#2927)
1 parent 5c2511e commit 0dfbae6

File tree

9 files changed: +140, -72 lines

.changeset/thin-tigers-yawn.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"roo-cline": patch
+---
+
+Allow users to turn prompt caching on / off for Gemini 2.5 on OpenRouter

src/api/providers/__tests__/gemini.test.ts

Lines changed: 1 addition & 8 deletions
@@ -74,14 +74,7 @@ describe("GeminiHandler", () => {
 			expect(chunks.length).toBe(3)
 			expect(chunks[0]).toEqual({ type: "text", text: "Hello" })
 			expect(chunks[1]).toEqual({ type: "text", text: " world!" })
-			expect(chunks[2]).toEqual({
-				type: "usage",
-				inputTokens: 10,
-				outputTokens: 5,
-				cacheReadTokens: undefined,
-				cacheWriteTokens: undefined,
-				thinkingTokens: undefined,
-			})
+			expect(chunks[2]).toEqual({ type: "usage", inputTokens: 10, outputTokens: 5 })

 			// Verify the call to generateContentStream
 			expect(handler["client"].models.generateContentStream).toHaveBeenCalledWith(

src/api/providers/__tests__/openrouter.test.ts

Lines changed: 6 additions & 2 deletions
@@ -54,10 +54,14 @@ describe("OpenRouterHandler", () => {
 				id: mockOptions.openRouterModelId,
 				info: mockOptions.openRouterModelInfo,
 				maxTokens: 1000,
-				reasoning: undefined,
-				temperature: 0,
 				thinking: undefined,
+				temperature: 0,
+				reasoningEffort: undefined,
 				topP: undefined,
+				promptCache: {
+					supported: false,
+					optional: false,
+				},
 			})
 		})

src/api/providers/fetchers/__tests__/fixtures/openrouter-models.json

Lines changed: 3 additions & 3 deletions
Large diffs are not rendered by default.

src/api/providers/fetchers/__tests__/openrouter.test.ts

Lines changed: 26 additions & 1 deletion
@@ -9,7 +9,7 @@ import { PROMPT_CACHING_MODELS } from "../../../../shared/api"
 import { getOpenRouterModels } from "../openrouter"

 nockBack.fixtures = path.join(__dirname, "fixtures")
-nockBack.setMode("dryrun")
+nockBack.setMode("lockdown")

 describe("OpenRouter API", () => {
 	describe("getOpenRouterModels", () => {
@@ -66,6 +66,31 @@ describe("OpenRouter API", () => {
 				supportsComputerUse: true,
 			})

+			expect(
+				Object.entries(models)
+					.filter(([id, _]) => id.startsWith("anthropic/claude-3"))
+					.map(([id, model]) => ({ id, maxTokens: model.maxTokens }))
+					.sort(({ id: a }, { id: b }) => a.localeCompare(b)),
+			).toEqual([
+				{ id: "anthropic/claude-3-haiku", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-haiku:beta", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-opus", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-opus:beta", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-sonnet", maxTokens: 4096 },
+				{ id: "anthropic/claude-3-sonnet:beta", maxTokens: 4096 },
+				{ id: "anthropic/claude-3.5-haiku", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-haiku-20241022", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-haiku-20241022:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-haiku:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet-20240620", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet-20240620:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.5-sonnet:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.7-sonnet", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.7-sonnet:beta", maxTokens: 8192 },
+				{ id: "anthropic/claude-3.7-sonnet:thinking", maxTokens: 128000 },
+			])
+
 			nockDone()
 		})
 	})

src/api/providers/fetchers/openrouter.ts

Lines changed: 28 additions & 26 deletions
@@ -1,7 +1,13 @@
 import axios from "axios"
 import { z } from "zod"

-import { ApiHandlerOptions, ModelInfo } from "../../../shared/api"
+import {
+	ApiHandlerOptions,
+	ModelInfo,
+	anthropicModels,
+	COMPUTER_USE_MODELS,
+	OPTIONAL_PROMPT_CACHING_MODELS,
+} from "../../../shared/api"
 import { parseApiPrice } from "../../../utils/cost"

 // https://openrouter.ai/api/v1/models
@@ -62,8 +68,8 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
 			? parseApiPrice(rawModel.pricing?.input_cache_read)
 			: undefined

-		// Disable prompt caching for Gemini models for now.
-		const supportsPromptCache = !!cacheWritesPrice && !!cacheReadsPrice && !rawModel.id.startsWith("google")
+		const supportsPromptCache =
+			typeof cacheWritesPrice !== "undefined" && typeof cacheReadsPrice !== "undefined"

 		const modelInfo: ModelInfo = {
 			maxTokens: rawModel.top_provider?.max_completion_tokens,
@@ -78,29 +84,25 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
 			thinking: rawModel.id === "anthropic/claude-3.7-sonnet:thinking",
 		}

-		// Then OpenRouter model definition doesn't give us any hints about computer use,
-		// so we need to set that manually.
-		// The ideal `maxTokens` values are model dependent, but we should probably DRY
-		// this up and use the values defined for the Anthropic providers.
-		switch (true) {
-			case rawModel.id.startsWith("anthropic/claude-3.7-sonnet"):
-				modelInfo.supportsComputerUse = true
-				modelInfo.maxTokens = rawModel.id === "anthropic/claude-3.7-sonnet:thinking" ? 128_000 : 8192
-				break
-			case rawModel.id.startsWith("anthropic/claude-3.5-sonnet-20240620"):
-				modelInfo.maxTokens = 8192
-				break
-			case rawModel.id.startsWith("anthropic/claude-3.5-sonnet"):
-				modelInfo.supportsComputerUse = true
-				modelInfo.maxTokens = 8192
-				break
-			case rawModel.id.startsWith("anthropic/claude-3-5-haiku"):
-			case rawModel.id.startsWith("anthropic/claude-3-opus"):
-			case rawModel.id.startsWith("anthropic/claude-3-haiku"):
-				modelInfo.maxTokens = 8192
-				break
-			default:
-				break
+		// The OpenRouter model definition doesn't give us any hints about
+		// computer use, so we need to set that manually.
+		if (COMPUTER_USE_MODELS.has(rawModel.id)) {
+			modelInfo.supportsComputerUse = true
+		}
+
+		// We want to treat prompt caching as "experimental" for these models.
+		if (OPTIONAL_PROMPT_CACHING_MODELS.has(rawModel.id)) {
+			modelInfo.isPromptCacheOptional = true
+		}
+
+		// Claude 3.7 Sonnet is a "hybrid" thinking model, and the `maxTokens`
+		// values can be configured. For the non-thinking variant we want to
+		// use 8k. The `thinking` variant can be run in 64k and 128k modes,
+		// and we want to use 128k.
+		if (rawModel.id.startsWith("anthropic/claude-3.7-sonnet")) {
+			modelInfo.maxTokens = rawModel.id.includes("thinking")
+				? anthropicModels["claude-3-7-sonnet-20250219:thinking"].maxTokens
+				: anthropicModels["claude-3-7-sonnet-20250219"].maxTokens
 		}

 		models[rawModel.id] = modelInfo
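For illustration, here is a minimal standalone sketch of how the new mapping classifies a model. derivePromptCacheFlags is a hypothetical helper, and the prices are made-up numbers rather than values from the OpenRouter API or the test fixture: caching counts as supported whenever both cache prices are defined, and membership in OPTIONAL_PROMPT_CACHING_MODELS additionally marks it as opt-in.

// Sketch only: mirrors the mapping above with hypothetical inputs.
import { OPTIONAL_PROMPT_CACHING_MODELS } from "../../../shared/api"

function derivePromptCacheFlags(id: string, cacheWritesPrice?: number, cacheReadsPrice?: number) {
	return {
		// Supported whenever both cache prices are present in the pricing data.
		supportsPromptCache:
			typeof cacheWritesPrice !== "undefined" && typeof cacheReadsPrice !== "undefined",
		// Opt-in ("experimental") for the models listed in OPTIONAL_PROMPT_CACHING_MODELS.
		isPromptCacheOptional: OPTIONAL_PROMPT_CACHING_MODELS.has(id),
	}
}

// Hypothetical prices (USD per 1M tokens):
derivePromptCacheFlags("google/gemini-2.5-pro-preview-03-25", 0.625, 0.31)
// -> { supportsPromptCache: true, isPromptCacheOptional: true }

derivePromptCacheFlags("anthropic/claude-3.5-sonnet", 3.75, 0.3)
// -> { supportsPromptCache: true, isPromptCacheOptional: false }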

src/api/providers/gemini.ts

Lines changed: 37 additions & 27 deletions
@@ -54,7 +54,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl

 		let uncachedContent: Content[] | undefined = undefined
 		let cachedContent: string | undefined = undefined
-		let cacheWriteTokens: number | undefined = undefined

 		// The minimum input token count for context caching is 4,096.
 		// For a basic approximation we assume 4 characters per token.
@@ -67,6 +66,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 			cacheKey &&
 			contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM

+		let cacheWrite = false
+
 		if (isCacheAvailable) {
 			const cacheEntry = this.contentCaches.get<CacheEntry>(cacheKey)

@@ -97,9 +98,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl

 					if (name) {
 						this.contentCaches.set<CacheEntry>(cacheKey, { key: name, count: contents.length })
-						cacheWriteTokens = usageMetadata?.totalTokenCount ?? 0
 						console.log(
-							`[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`,
+							`[GeminiHandler] cached ${contents.length} messages (${usageMetadata?.totalTokenCount ?? "-"} tokens) in ${Date.now() - timestamp}ms`,
 						)
 					}
 				})
@@ -109,6 +109,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 					.finally(() => {
 						this.isCacheBusy = false
 					})
+
+				cacheWrite = true
 			}
 		}

@@ -146,27 +148,24 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		if (lastUsageMetadata) {
 			const inputTokens = lastUsageMetadata.promptTokenCount ?? 0
 			const outputTokens = lastUsageMetadata.candidatesTokenCount ?? 0
+			const cacheWriteTokens = cacheWrite ? inputTokens : undefined
 			const cacheReadTokens = lastUsageMetadata.cachedContentTokenCount
 			const reasoningTokens = lastUsageMetadata.thoughtsTokenCount

-			const totalCost = isCacheUsed
-				? this.calculateCost({
-						info,
-						inputTokens,
-						outputTokens,
-						cacheWriteTokens,
-						cacheReadTokens,
-					})
-				: undefined
-
 			yield {
 				type: "usage",
 				inputTokens,
 				outputTokens,
 				cacheWriteTokens,
 				cacheReadTokens,
 				reasoningTokens,
-				totalCost,
+				totalCost: this.calculateCost({
+					info,
+					inputTokens,
+					outputTokens,
+					cacheWriteTokens,
+					cacheReadTokens,
+				}),
 			}
 		}
 	}
@@ -250,8 +249,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 		info,
 		inputTokens,
 		outputTokens,
-		cacheWriteTokens,
-		cacheReadTokens,
+		cacheWriteTokens = 0,
+		cacheReadTokens = 0,
 	}: {
 		info: ModelInfo
 		inputTokens: number
@@ -281,21 +280,32 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
 			}
 		}

-		let inputTokensCost = inputPrice * (inputTokens / 1_000_000)
-		let outputTokensCost = outputPrice * (outputTokens / 1_000_000)
-		let cacheWriteCost = 0
-		let cacheReadCost = 0
+		// Subtract the cached input tokens from the total input tokens.
+		const uncachedInputTokens = inputTokens - cacheReadTokens

-		if (cacheWriteTokens) {
-			cacheWriteCost = cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60)
+		let cacheWriteCost =
+			cacheWriteTokens > 0 ? cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) : 0
+		let cacheReadCost = cacheReadTokens > 0 ? cacheReadsPrice * (cacheReadTokens / 1_000_000) : 0
+
+		const inputTokensCost = inputPrice * (uncachedInputTokens / 1_000_000)
+		const outputTokensCost = outputPrice * (outputTokens / 1_000_000)
+		const totalCost = inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
+
+		const trace: Record<string, { price: number; tokens: number; cost: number }> = {
+			input: { price: inputPrice, tokens: uncachedInputTokens, cost: inputTokensCost },
+			output: { price: outputPrice, tokens: outputTokens, cost: outputTokensCost },
 		}

-		if (cacheReadTokens) {
-			const uncachedReadTokens = inputTokens - cacheReadTokens
-			cacheReadCost = cacheReadsPrice * (cacheReadTokens / 1_000_000)
-			inputTokensCost = inputPrice * (uncachedReadTokens / 1_000_000)
+		if (cacheWriteTokens > 0) {
+			trace.cacheWrite = { price: cacheWritesPrice, tokens: cacheWriteTokens, cost: cacheWriteCost }
 		}

-		return inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
+		if (cacheReadTokens > 0) {
+			trace.cacheRead = { price: cacheReadsPrice, tokens: cacheReadTokens, cost: cacheReadCost }
+		}
+
+		// console.log(`[GeminiHandler] calculateCost -> ${totalCost}`, trace)
+
+		return totalCost
 	}
 }
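To make the revised cost math concrete, here is a standalone sketch of the same formula with made-up prices (the real values come from the model's pricing fields, and CACHE_TTL is the handler's own constant; the 5-minute value below is only an assumption). Tokens served from the cache are billed at the cache-read rate rather than the input rate, and a cache write is billed per token scaled by CACHE_TTL / 60, i.e. the cache lifetime expressed in hours if the TTL is in minutes.

// Standalone sketch of the cost formula above; prices and TTL are assumed values.
const CACHE_TTL = 5 // minutes (assumption for this sketch)

function calculateCost({
	inputPrice, // USD per 1M tokens
	outputPrice,
	cacheWritesPrice,
	cacheReadsPrice,
	inputTokens,
	outputTokens,
	cacheWriteTokens = 0,
	cacheReadTokens = 0,
}: {
	inputPrice: number
	outputPrice: number
	cacheWritesPrice: number
	cacheReadsPrice: number
	inputTokens: number
	outputTokens: number
	cacheWriteTokens?: number
	cacheReadTokens?: number
}): number {
	// Cached input tokens are billed at the cache-read rate, not the input rate.
	const uncachedInputTokens = inputTokens - cacheReadTokens

	const inputCost = inputPrice * (uncachedInputTokens / 1_000_000)
	const outputCost = outputPrice * (outputTokens / 1_000_000)
	const cacheWriteCost =
		cacheWriteTokens > 0 ? cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) : 0
	const cacheReadCost = cacheReadTokens > 0 ? cacheReadsPrice * (cacheReadTokens / 1_000_000) : 0

	return inputCost + outputCost + cacheWriteCost + cacheReadCost
}

// Example: 100k-token prompt, 80k of it read from cache, 2k output, no cache write.
// 1.25 * 0.02 + 10 * 0.002 + 0 + 0.31 * 0.08 = 0.0698 (assumed prices)
calculateCost({
	inputPrice: 1.25,
	outputPrice: 10,
	cacheWritesPrice: 1.625,
	cacheReadsPrice: 0.31,
	inputTokens: 100_000,
	outputTokens: 2_000,
	cacheReadTokens: 80_000,
})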

src/api/providers/openrouter.ts

Lines changed: 10 additions & 2 deletions
@@ -7,6 +7,7 @@ import {
 	openRouterDefaultModelId,
 	openRouterDefaultModelInfo,
 	PROMPT_CACHING_MODELS,
+	OPTIONAL_PROMPT_CACHING_MODELS,
 } from "../../shared/api"
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { ApiStreamChunk } from "../transform/stream"
@@ -65,7 +66,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 		systemPrompt: string,
 		messages: Anthropic.Messages.MessageParam[],
 	): AsyncGenerator<ApiStreamChunk> {
-		let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort, info } = this.getModel()
+		let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort, promptCache } = this.getModel()

 		// Convert Anthropic messages to OpenAI format.
 		let openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
@@ -78,11 +79,13 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 			openAiMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
 		}

+		const isCacheAvailable = promptCache.supported && (!promptCache.optional || this.options.promptCachingEnabled)
+
 		// Prompt caching: https://openrouter.ai/docs/prompt-caching
 		// Now with Gemini support: https://openrouter.ai/docs/features/prompt-caching
 		// Note that we don't check the `ModelInfo` object because it is cached
 		// in the settings for OpenRouter and the value could be stale.
-		if (PROMPT_CACHING_MODELS.has(modelId)) {
+		if (isCacheAvailable) {
 			openAiMessages[0] = {
 				role: "system",
 				// @ts-ignore-next-line
@@ -193,8 +196,13 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
 		return {
 			id,
 			info,
+			// maxTokens, thinking, temperature, reasoningEffort
 			...getModelParams({ options: this.options, model: info, defaultTemperature }),
 			topP,
+			promptCache: {
+				supported: PROMPT_CACHING_MODELS.has(id),
+				optional: OPTIONAL_PROMPT_CACHING_MODELS.has(id),
+			},
 		}
 	}
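The gate reads like a small truth table. Here is a minimal standalone restatement of the same predicate (isCacheAvailable is written as a free function only for illustration, with promptCachingEnabled standing in for this.options.promptCachingEnabled):

// Standalone restatement of the gate above.
type PromptCache = { supported: boolean; optional: boolean }

function isCacheAvailable(promptCache: PromptCache, promptCachingEnabled?: boolean): boolean {
	// Always-on models cache unconditionally; optional models only when the user opted in.
	return promptCache.supported && (!promptCache.optional || !!promptCachingEnabled)
}

isCacheAvailable({ supported: true, optional: false }) // true  — e.g. Claude 3.x, regardless of the setting
isCacheAvailable({ supported: true, optional: true }, true) // true  — e.g. Gemini 2.5 with the toggle on
isCacheAvailable({ supported: true, optional: true }, false) // false — Gemini 2.5 with the toggle off
isCacheAvailable({ supported: false, optional: false }) // false — model without cache pricing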

src/shared/api.ts

Lines changed: 24 additions & 3 deletions
@@ -1401,8 +1401,10 @@ export const vscodeLlmModels = {
 * Constants
 */

+// These models support reasoning efforts.
 export const REASONING_MODELS = new Set(["x-ai/grok-3-mini-beta", "grok-3-mini-beta", "grok-3-mini-fast-beta"])

+// These models support prompt caching.
 export const PROMPT_CACHING_MODELS = new Set([
 	"anthropic/claude-3-haiku",
 	"anthropic/claude-3-haiku:beta",
@@ -1421,7 +1423,26 @@ export const PROMPT_CACHING_MODELS = new Set([
 	"anthropic/claude-3.7-sonnet",
 	"anthropic/claude-3.7-sonnet:beta",
 	"anthropic/claude-3.7-sonnet:thinking",
-	// "google/gemini-2.0-flash-001",
-	// "google/gemini-flash-1.5",
-	// "google/gemini-flash-1.5-8b",
+	"google/gemini-2.5-pro-preview-03-25",
+	"google/gemini-2.0-flash-001",
+	"google/gemini-flash-1.5",
+	"google/gemini-flash-1.5-8b",
+])
+
+// These models don't have prompt caching enabled by default (you can turn it on
+// in settings).
+export const OPTIONAL_PROMPT_CACHING_MODELS = new Set([
+	"google/gemini-2.5-pro-preview-03-25",
+	"google/gemini-2.0-flash-001",
+	"google/gemini-flash-1.5",
+	"google/gemini-flash-1.5-8b",
+])
+
+// https://www.anthropic.com/news/3-5-models-and-computer-use
+export const COMPUTER_USE_MODELS = new Set([
+	"anthropic/claude-3.5-sonnet",
+	"anthropic/claude-3.5-sonnet:beta",
+	"anthropic/claude-3.7-sonnet",
+	"anthropic/claude-3.7-sonnet:beta",
+	"anthropic/claude-3.7-sonnet:thinking",
 ])
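Since the handler treats optional caching as a refinement of supported caching, OPTIONAL_PROMPT_CACHING_MODELS should remain a subset of PROMPT_CACHING_MODELS (as it is in this commit). A possible invariant check, sketched here rather than taken from the repository's tests, assuming it lives next to src/shared/api.ts so the relative import resolves:

// Sketch of a subset check (not an existing test in this commit).
import { OPTIONAL_PROMPT_CACHING_MODELS, PROMPT_CACHING_MODELS } from "./api"

for (const id of OPTIONAL_PROMPT_CACHING_MODELS) {
	if (!PROMPT_CACHING_MODELS.has(id)) {
		throw new Error(`${id} is optional-cache but missing from PROMPT_CACHING_MODELS`)
	}
}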

0 commit comments
