5 changes: 5 additions & 0 deletions .changeset/thin-tigers-yawn.md
@@ -0,0 +1,5 @@
---
"roo-cline": patch
---

Allow users to turn prompt caching on / off for Gemini 2.5 on OpenRouter
9 changes: 1 addition & 8 deletions src/api/providers/__tests__/gemini.test.ts
@@ -74,14 +74,7 @@ describe("GeminiHandler", () => {
expect(chunks.length).toBe(3)
expect(chunks[0]).toEqual({ type: "text", text: "Hello" })
expect(chunks[1]).toEqual({ type: "text", text: " world!" })
expect(chunks[2]).toEqual({
type: "usage",
inputTokens: 10,
outputTokens: 5,
cacheReadTokens: undefined,
cacheWriteTokens: undefined,
thinkingTokens: undefined,
})
expect(chunks[2]).toEqual({ type: "usage", inputTokens: 10, outputTokens: 5 })

// Verify the call to generateContentStream
expect(handler["client"].models.generateContentStream).toHaveBeenCalledWith(
8 changes: 6 additions & 2 deletions src/api/providers/__tests__/openrouter.test.ts
@@ -54,10 +54,14 @@ describe("OpenRouterHandler", () => {
id: mockOptions.openRouterModelId,
info: mockOptions.openRouterModelInfo,
maxTokens: 1000,
reasoning: undefined,
temperature: 0,
thinking: undefined,
temperature: 0,
reasoningEffort: undefined,
topP: undefined,
promptCache: {
supported: false,
optional: false,
},
})
})


Large diffs are not rendered by default.

27 changes: 26 additions & 1 deletion src/api/providers/fetchers/__tests__/openrouter.test.ts
@@ -9,7 +9,7 @@ import { PROMPT_CACHING_MODELS } from "../../../../shared/api"
import { getOpenRouterModels } from "../openrouter"

nockBack.fixtures = path.join(__dirname, "fixtures")
nockBack.setMode("dryrun")
nockBack.setMode("lockdown")

describe("OpenRouter API", () => {
describe("getOpenRouterModels", () => {
@@ -66,6 +66,31 @@ describe("OpenRouter API", () => {
supportsComputerUse: true,
})

expect(
Object.entries(models)
.filter(([id, _]) => id.startsWith("anthropic/claude-3"))
.map(([id, model]) => ({ id, maxTokens: model.maxTokens }))
.sort(({ id: a }, { id: b }) => a.localeCompare(b)),
).toEqual([
{ id: "anthropic/claude-3-haiku", maxTokens: 4096 },
{ id: "anthropic/claude-3-haiku:beta", maxTokens: 4096 },
{ id: "anthropic/claude-3-opus", maxTokens: 4096 },
{ id: "anthropic/claude-3-opus:beta", maxTokens: 4096 },
{ id: "anthropic/claude-3-sonnet", maxTokens: 4096 },
{ id: "anthropic/claude-3-sonnet:beta", maxTokens: 4096 },
{ id: "anthropic/claude-3.5-haiku", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-haiku-20241022", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-haiku-20241022:beta", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-haiku:beta", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-sonnet", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-sonnet-20240620", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-sonnet-20240620:beta", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-sonnet:beta", maxTokens: 8192 },
{ id: "anthropic/claude-3.7-sonnet", maxTokens: 8192 },
{ id: "anthropic/claude-3.7-sonnet:beta", maxTokens: 8192 },
{ id: "anthropic/claude-3.7-sonnet:thinking", maxTokens: 128000 },
])

nockDone()
})
})
54 changes: 28 additions & 26 deletions src/api/providers/fetchers/openrouter.ts
@@ -1,7 +1,13 @@
import axios from "axios"
import { z } from "zod"

import { ApiHandlerOptions, ModelInfo } from "../../../shared/api"
import {
ApiHandlerOptions,
ModelInfo,
anthropicModels,
COMPUTER_USE_MODELS,
OPTIONAL_PROMPT_CACHING_MODELS,
} from "../../../shared/api"
import { parseApiPrice } from "../../../utils/cost"

// https://openrouter.ai/api/v1/models
@@ -62,8 +68,8 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
? parseApiPrice(rawModel.pricing?.input_cache_read)
: undefined

// Disable prompt caching for Gemini models for now.
const supportsPromptCache = !!cacheWritesPrice && !!cacheReadsPrice && !rawModel.id.startsWith("google")
const supportsPromptCache =
typeof cacheWritesPrice !== "undefined" && typeof cacheReadsPrice !== "undefined"

const modelInfo: ModelInfo = {
maxTokens: rawModel.top_provider?.max_completion_tokens,
@@ -78,29 +84,25 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
thinking: rawModel.id === "anthropic/claude-3.7-sonnet:thinking",
}

// Then OpenRouter model definition doesn't give us any hints about computer use,
// so we need to set that manually.
// The ideal `maxTokens` values are model dependent, but we should probably DRY
// this up and use the values defined for the Anthropic providers.
switch (true) {
case rawModel.id.startsWith("anthropic/claude-3.7-sonnet"):
modelInfo.supportsComputerUse = true
modelInfo.maxTokens = rawModel.id === "anthropic/claude-3.7-sonnet:thinking" ? 128_000 : 8192
break
case rawModel.id.startsWith("anthropic/claude-3.5-sonnet-20240620"):
modelInfo.maxTokens = 8192
break
case rawModel.id.startsWith("anthropic/claude-3.5-sonnet"):
modelInfo.supportsComputerUse = true
modelInfo.maxTokens = 8192
break
case rawModel.id.startsWith("anthropic/claude-3-5-haiku"):
case rawModel.id.startsWith("anthropic/claude-3-opus"):
case rawModel.id.startsWith("anthropic/claude-3-haiku"):
modelInfo.maxTokens = 8192
break
default:
break
// The OpenRouter model definition doesn't give us any hints about
// computer use, so we need to set that manually.
if (COMPUTER_USE_MODELS.has(rawModel.id)) {
modelInfo.supportsComputerUse = true
}

// We want to treat prompt caching as "experimental" for these models.
if (OPTIONAL_PROMPT_CACHING_MODELS.has(rawModel.id)) {
modelInfo.isPromptCacheOptional = true
}

// Claude 3.7 Sonnet is a "hybrid" thinking model, and the `maxTokens`
// values can be configured. For the non-thinking variant we want to
// use 8k. The `thinking` variant can be run in 64k and 128k modes,
// and we want to use 128k.
if (rawModel.id.startsWith("anthropic/claude-3.7-sonnet")) {
modelInfo.maxTokens = rawModel.id.includes("thinking")
? anthropicModels["claude-3-7-sonnet-20250219:thinking"].maxTokens
: anthropicModels["claude-3-7-sonnet-20250219"].maxTokens
}

models[rawModel.id] = modelInfo
64 changes: 37 additions & 27 deletions src/api/providers/gemini.ts
@@ -54,7 +54,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl

let uncachedContent: Content[] | undefined = undefined
let cachedContent: string | undefined = undefined
let cacheWriteTokens: number | undefined = undefined

// The minimum input token count for context caching is 4,096.
// For a basic approximation we assume 4 characters per token.
@@ -67,6 +66,8 @@
cacheKey &&
contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM

let cacheWrite = false

if (isCacheAvailable) {
const cacheEntry = this.contentCaches.get<CacheEntry>(cacheKey)

@@ -97,9 +98,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl

if (name) {
this.contentCaches.set<CacheEntry>(cacheKey, { key: name, count: contents.length })
cacheWriteTokens = usageMetadata?.totalTokenCount ?? 0
console.log(
`[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`,
`[GeminiHandler] cached ${contents.length} messages (${usageMetadata?.totalTokenCount ?? "-"} tokens) in ${Date.now() - timestamp}ms`,
)
}
})
@@ -109,6 +109,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
.finally(() => {
this.isCacheBusy = false
})

cacheWrite = true
}
}

@@ -146,27 +148,24 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
if (lastUsageMetadata) {
const inputTokens = lastUsageMetadata.promptTokenCount ?? 0
const outputTokens = lastUsageMetadata.candidatesTokenCount ?? 0
const cacheWriteTokens = cacheWrite ? inputTokens : undefined
const cacheReadTokens = lastUsageMetadata.cachedContentTokenCount
const reasoningTokens = lastUsageMetadata.thoughtsTokenCount

const totalCost = isCacheUsed
? this.calculateCost({
info,
inputTokens,
outputTokens,
cacheWriteTokens,
cacheReadTokens,
})
: undefined

yield {
type: "usage",
inputTokens,
outputTokens,
cacheWriteTokens,
cacheReadTokens,
reasoningTokens,
totalCost,
totalCost: this.calculateCost({
info,
inputTokens,
outputTokens,
cacheWriteTokens,
cacheReadTokens,
}),
}
}
}
@@ -250,8 +249,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
info,
inputTokens,
outputTokens,
cacheWriteTokens,
cacheReadTokens,
cacheWriteTokens = 0,
cacheReadTokens = 0,
}: {
info: ModelInfo
inputTokens: number
@@ -281,21 +280,32 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
}
}

let inputTokensCost = inputPrice * (inputTokens / 1_000_000)
let outputTokensCost = outputPrice * (outputTokens / 1_000_000)
let cacheWriteCost = 0
let cacheReadCost = 0
// Subtract the cached input tokens from the total input tokens.
const uncachedInputTokens = inputTokens - cacheReadTokens

if (cacheWriteTokens) {
cacheWriteCost = cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60)
let cacheWriteCost =
cacheWriteTokens > 0 ? cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) : 0
let cacheReadCost = cacheReadTokens > 0 ? cacheReadsPrice * (cacheReadTokens / 1_000_000) : 0

const inputTokensCost = inputPrice * (uncachedInputTokens / 1_000_000)
const outputTokensCost = outputPrice * (outputTokens / 1_000_000)
const totalCost = inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost

const trace: Record<string, { price: number; tokens: number; cost: number }> = {
input: { price: inputPrice, tokens: uncachedInputTokens, cost: inputTokensCost },
output: { price: outputPrice, tokens: outputTokens, cost: outputTokensCost },
}

if (cacheReadTokens) {
const uncachedReadTokens = inputTokens - cacheReadTokens
cacheReadCost = cacheReadsPrice * (cacheReadTokens / 1_000_000)
inputTokensCost = inputPrice * (uncachedReadTokens / 1_000_000)
if (cacheWriteTokens > 0) {
trace.cacheWrite = { price: cacheWritesPrice, tokens: cacheWriteTokens, cost: cacheWriteCost }
}

return inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
if (cacheReadTokens > 0) {
trace.cacheRead = { price: cacheReadsPrice, tokens: cacheReadTokens, cost: cacheReadCost }
}

// console.log(`[GeminiHandler] calculateCost -> ${totalCost}`, trace)

return totalCost
}
}
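For clarity, here is a rough worked example of the revised `calculateCost` math. All prices, token counts, and the 5-minute `CACHE_TTL` below are made-up figures for illustration (they are not taken from this diff, and the sketch assumes `CACHE_TTL` is expressed in minutes): cached input tokens are billed at the cache-read rate and subtracted from the tokens billed at the normal input rate, while cache writes are billed per million tokens scaled by the cache lifetime in hours.

	// Hypothetical figures only; real prices come from the ModelInfo object.
	const inputPrice = 1.25 // $ per 1M input tokens (assumed)
	const outputPrice = 5.0 // $ per 1M output tokens (assumed)
	const cacheWritesPrice = 1.625 // $ per 1M tokens per hour of cache lifetime (assumed)
	const cacheReadsPrice = 0.3125 // $ per 1M cached tokens read (assumed)
	const CACHE_TTL = 5 // minutes (assumed value for this sketch)

	const inputTokens = 100_000
	const outputTokens = 2_000
	const cacheReadTokens = 80_000 // prompt tokens served from the context cache
	const cacheWriteTokens = 100_000 // prompt tokens written to the cache this turn

	// Cached tokens are billed at the read rate, not the input rate.
	const uncachedInputTokens = inputTokens - cacheReadTokens // 20,000

	const inputTokensCost = inputPrice * (uncachedInputTokens / 1_000_000) // 0.025
	const outputTokensCost = outputPrice * (outputTokens / 1_000_000) // 0.01
	const cacheWriteCost = cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) // ~0.0135
	const cacheReadCost = cacheReadsPrice * (cacheReadTokens / 1_000_000) // 0.025

	const totalCost = inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost // ~0.0735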
12 changes: 10 additions & 2 deletions src/api/providers/openrouter.ts
@@ -7,6 +7,7 @@ import {
openRouterDefaultModelId,
openRouterDefaultModelInfo,
PROMPT_CACHING_MODELS,
OPTIONAL_PROMPT_CACHING_MODELS,
} from "../../shared/api"
import { convertToOpenAiMessages } from "../transform/openai-format"
import { ApiStreamChunk } from "../transform/stream"
@@ -65,7 +66,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
systemPrompt: string,
messages: Anthropic.Messages.MessageParam[],
): AsyncGenerator<ApiStreamChunk> {
let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort, info } = this.getModel()
let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort, promptCache } = this.getModel()

// Convert Anthropic messages to OpenAI format.
let openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
@@ -78,11 +79,13 @@
openAiMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
}

const isCacheAvailable = promptCache.supported && (!promptCache.optional || this.options.promptCachingEnabled)

// Prompt caching: https://openrouter.ai/docs/prompt-caching
// Now with Gemini support: https://openrouter.ai/docs/features/prompt-caching
// Note that we don't check the `ModelInfo` object because it is cached
// in the settings for OpenRouter and the value could be stale.
if (PROMPT_CACHING_MODELS.has(modelId)) {
if (isCacheAvailable) {
openAiMessages[0] = {
role: "system",
// @ts-ignore-next-line
@@ -193,8 +196,13 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
return {
id,
info,
// maxTokens, thinking, temperature, reasoningEffort
...getModelParams({ options: this.options, model: info, defaultTemperature }),
topP,
promptCache: {
supported: PROMPT_CACHING_MODELS.has(id),
optional: OPTIONAL_PROMPT_CACHING_MODELS.has(id),
},
}
}
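A minimal sketch of how the new `promptCache` flags gate caching in `createMessage`, restating the `isCacheAvailable` expression as a standalone function with illustrative scenarios (the flag names mirror this diff; the model scenarios are assumptions for illustration):

	// `supported` / `optional` are derived from PROMPT_CACHING_MODELS and
	// OPTIONAL_PROMPT_CACHING_MODELS; `promptCachingEnabled` is the user setting.
	function isCacheAvailable(
		promptCache: { supported: boolean; optional: boolean },
		promptCachingEnabled?: boolean,
	): boolean {
		return promptCache.supported && (!promptCache.optional || !!promptCachingEnabled)
	}

	// Claude models: caching is supported and not optional, so it is always applied.
	isCacheAvailable({ supported: true, optional: false }) // true

	// Gemini 2.5 on OpenRouter: supported but optional, so it follows the user setting.
	isCacheAvailable({ supported: true, optional: true }, true) // true
	isCacheAvailable({ supported: true, optional: true }, false) // false

	// Models with no cache pricing at all are never cached.
	isCacheAvailable({ supported: false, optional: false }, true) // false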

27 changes: 24 additions & 3 deletions src/shared/api.ts
@@ -1401,8 +1401,10 @@ export const vscodeLlmModels = {
* Constants
*/

// These models support reasoning efforts.
export const REASONING_MODELS = new Set(["x-ai/grok-3-mini-beta", "grok-3-mini-beta", "grok-3-mini-fast-beta"])

// These models support prompt caching.
export const PROMPT_CACHING_MODELS = new Set([
"anthropic/claude-3-haiku",
"anthropic/claude-3-haiku:beta",
@@ -1421,7 +1423,26 @@ export const PROMPT_CACHING_MODELS = new Set([
"anthropic/claude-3.7-sonnet",
"anthropic/claude-3.7-sonnet:beta",
"anthropic/claude-3.7-sonnet:thinking",
// "google/gemini-2.0-flash-001",
// "google/gemini-flash-1.5",
// "google/gemini-flash-1.5-8b",
"google/gemini-2.5-pro-preview-03-25",
"google/gemini-2.0-flash-001",
"google/gemini-flash-1.5",
"google/gemini-flash-1.5-8b",
])

// These models don't have prompt caching enabled by default (you can turn it on
// in settings).
export const OPTIONAL_PROMPT_CACHING_MODELS = new Set([
"google/gemini-2.5-pro-preview-03-25",
"google/gemini-2.0-flash-001",
"google/gemini-flash-1.5",
"google/gemini-flash-1.5-8b",
])

// https://www.anthropic.com/news/3-5-models-and-computer-use
export const COMPUTER_USE_MODELS = new Set([
"anthropic/claude-3.5-sonnet",
"anthropic/claude-3.5-sonnet:beta",
"anthropic/claude-3.7-sonnet",
"anthropic/claude-3.7-sonnet:beta",
"anthropic/claude-3.7-sonnet:thinking",
])