9 changes: 1 addition & 8 deletions src/api/providers/__tests__/gemini.test.ts
@@ -74,14 +74,7 @@ describe("GeminiHandler", () => {
expect(chunks.length).toBe(3)
expect(chunks[0]).toEqual({ type: "text", text: "Hello" })
expect(chunks[1]).toEqual({ type: "text", text: " world!" })
expect(chunks[2]).toEqual({
type: "usage",
inputTokens: 10,
outputTokens: 5,
cacheReadTokens: undefined,
cacheWriteTokens: undefined,
thinkingTokens: undefined,
})
expect(chunks[2]).toEqual({ type: "usage", inputTokens: 10, outputTokens: 5 })

// Verify the call to generateContentStream
expect(handler["client"].models.generateContentStream).toHaveBeenCalledWith(
8 changes: 6 additions & 2 deletions src/api/providers/__tests__/openrouter.test.ts
@@ -54,10 +54,14 @@ describe("OpenRouterHandler", () => {
id: mockOptions.openRouterModelId,
info: mockOptions.openRouterModelInfo,
maxTokens: 1000,
reasoning: undefined,
temperature: 0,
thinking: undefined,
temperature: 0,
reasoningEffort: undefined,
topP: undefined,
promptCache: {
supported: false,
optional: false,
},
})
})


Large diffs are not rendered by default.

27 changes: 26 additions & 1 deletion src/api/providers/fetchers/__tests__/openrouter.test.ts
@@ -9,7 +9,7 @@ import { PROMPT_CACHING_MODELS } from "../../../../shared/api"
import { getOpenRouterModels } from "../openrouter"

nockBack.fixtures = path.join(__dirname, "fixtures")
nockBack.setMode("dryrun")
nockBack.setMode("lockdown")

describe("OpenRouter API", () => {
describe("getOpenRouterModels", () => {
@@ -66,6 +66,31 @@ describe("OpenRouter API", () => {
supportsComputerUse: true,
})

expect(
Object.entries(models)
.filter(([id, _]) => id.startsWith("anthropic/claude-3"))
.map(([id, model]) => ({ id, maxTokens: model.maxTokens }))
.sort(({ id: a }, { id: b }) => a.localeCompare(b)),
).toEqual([
{ id: "anthropic/claude-3-haiku", maxTokens: 4096 },
{ id: "anthropic/claude-3-haiku:beta", maxTokens: 4096 },
{ id: "anthropic/claude-3-opus", maxTokens: 4096 },
{ id: "anthropic/claude-3-opus:beta", maxTokens: 4096 },
{ id: "anthropic/claude-3-sonnet", maxTokens: 4096 },
{ id: "anthropic/claude-3-sonnet:beta", maxTokens: 4096 },
{ id: "anthropic/claude-3.5-haiku", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-haiku-20241022", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-haiku-20241022:beta", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-haiku:beta", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-sonnet", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-sonnet-20240620", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-sonnet-20240620:beta", maxTokens: 8192 },
{ id: "anthropic/claude-3.5-sonnet:beta", maxTokens: 8192 },
{ id: "anthropic/claude-3.7-sonnet", maxTokens: 8192 },
{ id: "anthropic/claude-3.7-sonnet:beta", maxTokens: 8192 },
{ id: "anthropic/claude-3.7-sonnet:thinking", maxTokens: 128000 },
])

nockDone()
})
})
54 changes: 28 additions & 26 deletions src/api/providers/fetchers/openrouter.ts
@@ -1,7 +1,13 @@
import axios from "axios"
import { z } from "zod"

import { ApiHandlerOptions, ModelInfo } from "../../../shared/api"
import {
ApiHandlerOptions,
ModelInfo,
anthropicModels,
COMPUTER_USE_MODELS,
OPTIONAL_PROMPT_CACHING_MODELS,
} from "../../../shared/api"
import { parseApiPrice } from "../../../utils/cost"

// https://openrouter.ai/api/v1/models
@@ -62,8 +68,8 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
? parseApiPrice(rawModel.pricing?.input_cache_read)
: undefined

// Disable prompt caching for Gemini models for now.
const supportsPromptCache = !!cacheWritesPrice && !!cacheReadsPrice && !rawModel.id.startsWith("google")
const supportsPromptCache =
typeof cacheWritesPrice !== "undefined" && typeof cacheReadsPrice !== "undefined"

const modelInfo: ModelInfo = {
maxTokens: rawModel.top_provider?.max_completion_tokens,
@@ -78,29 +84,25 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
thinking: rawModel.id === "anthropic/claude-3.7-sonnet:thinking",
}

// Then OpenRouter model definition doesn't give us any hints about computer use,
// so we need to set that manually.
// The ideal `maxTokens` values are model dependent, but we should probably DRY
// this up and use the values defined for the Anthropic providers.
switch (true) {
case rawModel.id.startsWith("anthropic/claude-3.7-sonnet"):
modelInfo.supportsComputerUse = true
modelInfo.maxTokens = rawModel.id === "anthropic/claude-3.7-sonnet:thinking" ? 128_000 : 8192
break
case rawModel.id.startsWith("anthropic/claude-3.5-sonnet-20240620"):
modelInfo.maxTokens = 8192
break
case rawModel.id.startsWith("anthropic/claude-3.5-sonnet"):
modelInfo.supportsComputerUse = true
modelInfo.maxTokens = 8192
break
case rawModel.id.startsWith("anthropic/claude-3-5-haiku"):
case rawModel.id.startsWith("anthropic/claude-3-opus"):
case rawModel.id.startsWith("anthropic/claude-3-haiku"):
modelInfo.maxTokens = 8192
break
default:
break
// The OpenRouter model definition doesn't give us any hints about
// computer use, so we need to set that manually.
if (COMPUTER_USE_MODELS.has(rawModel.id)) {
modelInfo.supportsComputerUse = true
}

// We want to treat prompt caching as "experimental" for these models.
if (OPTIONAL_PROMPT_CACHING_MODELS.has(rawModel.id)) {
modelInfo.isPromptCacheOptional = true
}

// Claude 3.7 Sonnet is a "hybrid" thinking model, and the `maxTokens`
// values can be configured. For the non-thinking variant we want to
// use 8k. The `thinking` variant can be run in 64k and 128k modes,
// and we want to use 128k.
if (rawModel.id.startsWith("anthropic/claude-3.7-sonnet")) {
modelInfo.maxTokens = rawModel.id.includes("thinking")
? anthropicModels["claude-3-7-sonnet-20250219:thinking"].maxTokens
: anthropicModels["claude-3-7-sonnet-20250219"].maxTokens
}

models[rawModel.id] = modelInfo
64 changes: 37 additions & 27 deletions src/api/providers/gemini.ts
@@ -54,7 +54,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl

let uncachedContent: Content[] | undefined = undefined
let cachedContent: string | undefined = undefined
let cacheWriteTokens: number | undefined = undefined

// The minimum input token count for context caching is 4,096.
// For a basic approximation we assume 4 characters per token.
@@ -67,6 +66,8 @@
cacheKey &&
contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM

let cacheWrite = false

if (isCacheAvailable) {
const cacheEntry = this.contentCaches.get<CacheEntry>(cacheKey)

@@ -97,9 +98,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl

if (name) {
this.contentCaches.set<CacheEntry>(cacheKey, { key: name, count: contents.length })
cacheWriteTokens = usageMetadata?.totalTokenCount ?? 0
console.log(
`[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`,
`[GeminiHandler] cached ${contents.length} messages (${usageMetadata?.totalTokenCount ?? "-"} tokens) in ${Date.now() - timestamp}ms`,
)
}
})
@@ -109,6 +109,8 @@
.finally(() => {
this.isCacheBusy = false
})

cacheWrite = true
}
}

@@ -146,27 +148,24 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
if (lastUsageMetadata) {
const inputTokens = lastUsageMetadata.promptTokenCount ?? 0
const outputTokens = lastUsageMetadata.candidatesTokenCount ?? 0
const cacheWriteTokens = cacheWrite ? inputTokens : undefined
const cacheReadTokens = lastUsageMetadata.cachedContentTokenCount
const reasoningTokens = lastUsageMetadata.thoughtsTokenCount

const totalCost = isCacheUsed
? this.calculateCost({
info,
inputTokens,
outputTokens,
cacheWriteTokens,
cacheReadTokens,
})
: undefined

yield {
type: "usage",
inputTokens,
outputTokens,
cacheWriteTokens,
cacheReadTokens,
reasoningTokens,
totalCost,
totalCost: this.calculateCost({
info,
inputTokens,
outputTokens,
cacheWriteTokens,
cacheReadTokens,
}),
}
}
}
@@ -250,8 +249,8 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
info,
inputTokens,
outputTokens,
cacheWriteTokens,
cacheReadTokens,
cacheWriteTokens = 0,
cacheReadTokens = 0,
}: {
info: ModelInfo
inputTokens: number
@@ -281,21 +280,32 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
}
}

let inputTokensCost = inputPrice * (inputTokens / 1_000_000)
let outputTokensCost = outputPrice * (outputTokens / 1_000_000)
let cacheWriteCost = 0
let cacheReadCost = 0
// Subtract the cached input tokens from the total input tokens.
const uncachedInputTokens = inputTokens - cacheReadTokens

if (cacheWriteTokens) {
cacheWriteCost = cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60)
let cacheWriteCost =
cacheWriteTokens > 0 ? cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) : 0
let cacheReadCost = cacheReadTokens > 0 ? cacheReadsPrice * (cacheReadTokens / 1_000_000) : 0

const inputTokensCost = inputPrice * (uncachedInputTokens / 1_000_000)
const outputTokensCost = outputPrice * (outputTokens / 1_000_000)
const totalCost = inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost

const trace: Record<string, { price: number; tokens: number; cost: number }> = {
input: { price: inputPrice, tokens: uncachedInputTokens, cost: inputTokensCost },
output: { price: outputPrice, tokens: outputTokens, cost: outputTokensCost },
}

if (cacheReadTokens) {
const uncachedReadTokens = inputTokens - cacheReadTokens
cacheReadCost = cacheReadsPrice * (cacheReadTokens / 1_000_000)
inputTokensCost = inputPrice * (uncachedReadTokens / 1_000_000)
if (cacheWriteTokens > 0) {
trace.cacheWrite = { price: cacheWritesPrice, tokens: cacheWriteTokens, cost: cacheWriteCost }
}

return inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
if (cacheReadTokens > 0) {
trace.cacheRead = { price: cacheReadsPrice, tokens: cacheReadTokens, cost: cacheReadCost }
}

// console.log(`[GeminiHandler] calculateCost -> ${totalCost}`, trace)

return totalCost
}
}
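To make the revised `calculateCost` math concrete, here is a minimal standalone sketch of the same formula. The prices and token counts are illustrative only; `CACHE_TTL` is assumed to be expressed in minutes with the cache-write price billed per token-hour, as the handler's TTL factor suggests. This is not the provider's actual rate card.

```typescript
// Illustrative sketch of the cost formula above (hypothetical prices, per 1M tokens).
const CACHE_TTL = 5 // minutes; assumed to mirror the handler's constant

function calculateCost({
	inputTokens,
	outputTokens,
	cacheWriteTokens = 0,
	cacheReadTokens = 0,
}: {
	inputTokens: number
	outputTokens: number
	cacheWriteTokens?: number
	cacheReadTokens?: number
}): number {
	const inputPrice = 1.25 // $/1M uncached input tokens (example value)
	const outputPrice = 5.0 // $/1M output tokens (example value)
	const cacheWritesPrice = 1.0 // $/1M tokens per hour of cache storage (example value)
	const cacheReadsPrice = 0.31 // $/1M cached input tokens read (example value)

	// Cached input tokens are billed at the cache-read rate, so subtract them
	// from the tokens billed at the full input rate.
	const uncachedInputTokens = inputTokens - cacheReadTokens

	const cacheWriteCost =
		cacheWriteTokens > 0 ? cacheWritesPrice * (cacheWriteTokens / 1_000_000) * (CACHE_TTL / 60) : 0
	const cacheReadCost = cacheReadTokens > 0 ? cacheReadsPrice * (cacheReadTokens / 1_000_000) : 0
	const inputTokensCost = inputPrice * (uncachedInputTokens / 1_000_000)
	const outputTokensCost = outputPrice * (outputTokens / 1_000_000)

	return inputTokensCost + outputTokensCost + cacheWriteCost + cacheReadCost
}

// 100k prompt tokens (80k served from cache), 2k output tokens, and the full
// prompt written to the cache on this request: roughly $0.068 at the example prices.
console.log(calculateCost({ inputTokens: 100_000, outputTokens: 2_000, cacheWriteTokens: 100_000, cacheReadTokens: 80_000 }))
```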
12 changes: 10 additions & 2 deletions src/api/providers/openrouter.ts
@@ -7,6 +7,7 @@ import {
openRouterDefaultModelId,
openRouterDefaultModelInfo,
PROMPT_CACHING_MODELS,
OPTIONAL_PROMPT_CACHING_MODELS,
} from "../../shared/api"
import { convertToOpenAiMessages } from "../transform/openai-format"
import { ApiStreamChunk } from "../transform/stream"
@@ -65,7 +66,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
systemPrompt: string,
messages: Anthropic.Messages.MessageParam[],
): AsyncGenerator<ApiStreamChunk> {
let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort, info } = this.getModel()
let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort, promptCache } = this.getModel()

// Convert Anthropic messages to OpenAI format.
let openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
@@ -78,11 +79,13 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
openAiMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
}

const isCacheAvailable = promptCache.supported && (!promptCache.optional || this.options.promptCachingEnabled)

// Prompt caching: https://openrouter.ai/docs/prompt-caching
// Now with Gemini support: https://openrouter.ai/docs/features/prompt-caching
// Note that we don't check the `ModelInfo` object because it is cached
// in the settings for OpenRouter and the value could be stale.
if (PROMPT_CACHING_MODELS.has(modelId)) {
if (isCacheAvailable) {
openAiMessages[0] = {
role: "system",
// @ts-ignore-next-line
@@ -193,8 +196,13 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
return {
id,
info,
// maxTokens, thinking, temperature, reasoningEffort
...getModelParams({ options: this.options, model: info, defaultTemperature }),
topP,
promptCache: {
supported: PROMPT_CACHING_MODELS.has(id),
optional: OPTIONAL_PROMPT_CACHING_MODELS.has(id),
},
}
}

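For reference, a minimal sketch of how the new `promptCache` flags combine with the `promptCachingEnabled` option in `createMessage` above. The standalone helper is hypothetical, but the boolean logic mirrors the `isCacheAvailable` expression in the diff.

```typescript
// Caching is applied when the model supports it, unless support is marked
// "optional" (experimental), in which case the user must have opted in.
function isCacheAvailable(
	promptCache: { supported: boolean; optional: boolean },
	promptCachingEnabled?: boolean,
): boolean {
	return promptCache.supported && (!promptCache.optional || !!promptCachingEnabled)
}

console.log(isCacheAvailable({ supported: true, optional: false })) // always-on caching: true
console.log(isCacheAvailable({ supported: true, optional: true })) // optional caching, no opt-in: false
console.log(isCacheAvailable({ supported: true, optional: true }, true)) // optional caching, opted in: true
console.log(isCacheAvailable({ supported: false, optional: false }, true)) // unsupported model: false
```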
27 changes: 24 additions & 3 deletions src/shared/api.ts
@@ -1401,8 +1401,10 @@ export const vscodeLlmModels = {
* Constants
*/

// These models support reasoning efforts.
export const REASONING_MODELS = new Set(["x-ai/grok-3-mini-beta", "grok-3-mini-beta", "grok-3-mini-fast-beta"])

// These models support prompt caching.
export const PROMPT_CACHING_MODELS = new Set([
"anthropic/claude-3-haiku",
"anthropic/claude-3-haiku:beta",
@@ -1421,7 +1423,26 @@ export const PROMPT_CACHING_MODELS = new Set([
"anthropic/claude-3.7-sonnet",
"anthropic/claude-3.7-sonnet:beta",
"anthropic/claude-3.7-sonnet:thinking",
// "google/gemini-2.0-flash-001",
// "google/gemini-flash-1.5",
// "google/gemini-flash-1.5-8b",
"google/gemini-2.5-pro-preview-03-25",
"google/gemini-2.0-flash-001",
"google/gemini-flash-1.5",
"google/gemini-flash-1.5-8b",
])

// These models don't have prompt caching enabled by default (you can turn it on
// in settings).
export const OPTIONAL_PROMPT_CACHING_MODELS = new Set([
"google/gemini-2.5-pro-preview-03-25",
"google/gemini-2.0-flash-001",
"google/gemini-flash-1.5",
"google/gemini-flash-1.5-8b",
])

// https://www.anthropic.com/news/3-5-models-and-computer-use
export const COMPUTER_USE_MODELS = new Set([
"anthropic/claude-3.5-sonnet",
"anthropic/claude-3.5-sonnet:beta",
"anthropic/claude-3.7-sonnet",
"anthropic/claude-3.7-sonnet:beta",
"anthropic/claude-3.7-sonnet:thinking",
])