Commit 726c863

OpenRouter Gemini caching
1 parent 0d561f8 · commit 726c863

File tree

1 file changed: +87 −49

src/api/providers/openrouter.ts

Lines changed: 87 additions & 49 deletions
@@ -28,6 +28,25 @@ type OpenRouterChatCompletionParams = OpenAI.Chat.ChatCompletionCreateParams & {
 	}
 }
 
+// See `OpenAI.Chat.Completions.ChatCompletionChunk["usage"]`
+// `CompletionsAPI.CompletionUsage`
+interface CompletionUsage {
+	completion_tokens?: number
+	prompt_tokens?: number
+	total_tokens?: number
+	cost?: number
+
+	/**
+	 * Breakdown of tokens used in a completion.
+	 */
+	// completion_tokens_details?: CompletionUsage.CompletionTokensDetails;
+
+	/**
+	 * Breakdown of tokens used in the prompt.
+	 */
+	// prompt_tokens_details?: CompletionUsage.PromptTokensDetails;
+}
+
 export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler {
 	protected options: ApiHandlerOptions
 	private client: OpenAI
@@ -46,7 +65,15 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler {
 		systemPrompt: string,
 		messages: Anthropic.Messages.MessageParam[],
 	): AsyncGenerator<ApiStreamChunk> {
-		let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort } = this.getModel()
+		let {
+			id: modelId,
+			maxTokens,
+			thinking,
+			temperature,
+			supportsPromptCache,
+			topP,
+			reasoningEffort,
+		} = this.getModel()
 
 		// Convert Anthropic messages to OpenAI format.
 		let openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
@@ -59,46 +86,42 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler {
 			openAiMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
 		}
 
-		// prompt caching: https://openrouter.ai/docs/prompt-caching
-		// this is specifically for claude models (some models may 'support prompt caching' automatically without this)
-		switch (true) {
-			case modelId.startsWith("anthropic/"):
-				openAiMessages[0] = {
-					role: "system",
-					content: [
-						{
-							type: "text",
-							text: systemPrompt,
-							// @ts-ignore-next-line
-							cache_control: { type: "ephemeral" },
-						},
-					],
-				}
+		// Prompt caching: https://openrouter.ai/docs/prompt-caching
+		// Now with Gemini support: https://openrouter.ai/docs/features/prompt-caching
+		if (supportsPromptCache) {
+			openAiMessages[0] = {
+				role: "system",
+				content: [
+					{
+						type: "text",
+						text: systemPrompt,
+						// @ts-ignore-next-line
+						cache_control: { type: "ephemeral" },
+					},
+				],
+			}
 
-				// Add cache_control to the last two user messages
-				// (note: this works because we only ever add one user message at a time, but if we added multiple we'd need to mark the user message before the last assistant message)
-				const lastTwoUserMessages = openAiMessages.filter((msg) => msg.role === "user").slice(-2)
+			// Add cache_control to the last two user messages
+			// (note: this works because we only ever add one user message at a time, but if we added multiple we'd need to mark the user message before the last assistant message)
+			const lastTwoUserMessages = openAiMessages.filter((msg) => msg.role === "user").slice(-2)
 
-				lastTwoUserMessages.forEach((msg) => {
-					if (typeof msg.content === "string") {
-						msg.content = [{ type: "text", text: msg.content }]
-					}
+			lastTwoUserMessages.forEach((msg) => {
+				if (typeof msg.content === "string") {
+					msg.content = [{ type: "text", text: msg.content }]
+				}
 
-					if (Array.isArray(msg.content)) {
-						// NOTE: this is fine since env details will always be added at the end. but if it weren't there, and the user added a image_url type message, it would pop a text part before it and then move it after to the end.
-						let lastTextPart = msg.content.filter((part) => part.type === "text").pop()
+				if (Array.isArray(msg.content)) {
+					// NOTE: this is fine since env details will always be added at the end. but if it weren't there, and the user added a image_url type message, it would pop a text part before it and then move it after to the end.
+					let lastTextPart = msg.content.filter((part) => part.type === "text").pop()
 
-						if (!lastTextPart) {
-							lastTextPart = { type: "text", text: "..." }
-							msg.content.push(lastTextPart)
-						}
-						// @ts-ignore-next-line
-						lastTextPart["cache_control"] = { type: "ephemeral" }
+					if (!lastTextPart) {
+						lastTextPart = { type: "text", text: "..." }
+						msg.content.push(lastTextPart)
 					}
-				})
-				break
-			default:
-				break
+					// @ts-ignore-next-line
+					lastTextPart["cache_control"] = { type: "ephemeral" }
+				}
+			})
 		}
 
 		// https://openrouter.ai/docs/transforms
@@ -125,9 +148,9 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler {
 
 		const stream = await this.client.chat.completions.create(completionParams)
 
-		let lastUsage
+		let lastUsage: CompletionUsage | undefined = undefined
 
-		for await (const chunk of stream as unknown as AsyncIterable<OpenAI.Chat.Completions.ChatCompletionChunk>) {
+		for await (const chunk of stream) {
 			// OpenRouter returns an error object instead of the OpenAI SDK throwing an error.
 			if ("error" in chunk) {
 				const error = chunk.error as { message?: string; code?: number }
@@ -152,16 +175,12 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler {
 		}
 
 		if (lastUsage) {
-			yield this.processUsageMetrics(lastUsage)
-		}
-	}
-
-	processUsageMetrics(usage: any): ApiStreamUsageChunk {
-		return {
-			type: "usage",
-			inputTokens: usage?.prompt_tokens || 0,
-			outputTokens: usage?.completion_tokens || 0,
-			totalCost: usage?.cost || 0,
+			yield {
+				type: "usage",
+				inputTokens: lastUsage.prompt_tokens || 0,
+				outputTokens: lastUsage.completion_tokens || 0,
+				totalCost: lastUsage?.cost || 0,
+			}
 		}
 	}
 
@@ -171,7 +190,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler {
 
 		let id = modelId ?? openRouterDefaultModelId
 		const info = modelInfo ?? openRouterDefaultModelInfo
-
+		const supportsPromptCache = modelInfo?.supportsPromptCache
 		const isDeepSeekR1 = id.startsWith("deepseek/deepseek-r1") || modelId === "perplexity/sonar-reasoning"
 		const defaultTemperature = isDeepSeekR1 ? DEEP_SEEK_DEFAULT_TEMPERATURE : 0
 		const topP = isDeepSeekR1 ? 0.95 : undefined
@@ -180,6 +199,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler {
 			id,
 			info,
 			...getModelParams({ options: this.options, model: info, defaultTemperature }),
+			supportsPromptCache,
 			topP,
 		}
 	}
@@ -269,6 +289,24 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
 				modelInfo.cacheReadsPrice = 0.03
 				modelInfo.maxTokens = 8192
 				break
+			// case rawModel.id.startsWith("google/gemini-2.5-flash-preview"):
+			// 	modelInfo.supportsPromptCache = true
+			// 	break
+			case rawModel.id.startsWith("google/gemini-2.5-pro-preview-03-25"):
+				modelInfo.supportsPromptCache = true
+				break
+			case rawModel.id.startsWith("google/gemini-2.0-flash-001"):
+				modelInfo.supportsPromptCache = true
+				break
+			// case rawModel.id.startsWith("google/gemini-2.0-flash-lite-001"):
+			// 	modelInfo.supportsPromptCache = true
+			// 	break
+			case rawModel.id.startsWith("google/gemini-flash-1.5"):
+				modelInfo.supportsPromptCache = true
+				break
+			case rawModel.id.startsWith("google/gemini-pro-1.5"):
+				modelInfo.supportsPromptCache = true
+				break
 			default:
 				break
 		}
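For reference, a minimal sketch (not part of the commit) of the message shape the handler now builds whenever the selected model reports supportsPromptCache: the system prompt and the last text part of each of the last two user messages carry an ephemeral cache_control breakpoint, which OpenRouter forwards to providers that support prompt caching. The prompt strings below are placeholders.

// Sketch only: placeholder prompt text. cache_control is an OpenRouter extension that is
// not in the OpenAI SDK types, which is why the real code attaches it under @ts-ignore.
const cachedMessages = [
	{
		role: "system",
		content: [
			{
				type: "text",
				text: "You are a coding assistant.", // stands in for systemPrompt
				cache_control: { type: "ephemeral" }, // breakpoint on the large, stable system prompt
			},
		],
	},
	{
		role: "user",
		content: [
			{
				type: "text",
				text: "Continue the task.", // stands in for the last text part of a recent user turn
				cache_control: { type: "ephemeral" }, // same marking on each of the last two user messages
			},
		],
	},
]

Before this change the breakpoints were only added for model ids starting with "anthropic/"; now any model whose modelInfo sets supportsPromptCache gets them, which is what the Gemini cases added to getOpenRouterModels enable.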
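The usage handling that used to live in processUsageMetrics is now inlined in createMessage. Restated as a hypothetical standalone helper (the function name and the simplified return type are illustrative, not names from the codebase), the mapping is:

// Hypothetical helper mirroring the inlined yield. OpenRouter reports a dollar cost
// directly on the usage object, so no per-token price arithmetic is needed here.
// The return type is a simplified stand-in for the project's ApiStreamUsageChunk.
function toUsageChunk(usage: { prompt_tokens?: number; completion_tokens?: number; cost?: number }): {
	type: "usage"
	inputTokens: number
	outputTokens: number
	totalCost: number
} {
	return {
		type: "usage",
		inputTokens: usage.prompt_tokens || 0,
		outputTokens: usage.completion_tokens || 0,
		totalCost: usage.cost || 0,
	}
}

Because lastUsage is only recorded from streamed chunks and checked after the loop, the usage chunk is yielded once, after the text stream has finished.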
