Commit a9ca177

OpenRouter Gemini caching (#2847)
* OpenRouter Gemini caching
* Fix tests
* Remove unsupported models
* Clean up the task header a bit
* Update src/api/providers/openrouter.ts
* Remove model that doesn't seem to work

Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com>
1 parent e53d299 commit a9ca177

File tree: 8 files changed (+537 -497 lines)


src/api/providers/__tests__/openrouter.test.ts

Lines changed: 230 additions & 256 deletions
Large diffs are not rendered by default.

src/api/providers/openrouter.ts

Lines changed: 79 additions & 53 deletions

@@ -6,7 +6,7 @@ import OpenAI from "openai"
 import { ApiHandlerOptions, ModelInfo, openRouterDefaultModelId, openRouterDefaultModelInfo } from "../../shared/api"
 import { parseApiPrice } from "../../utils/cost"
 import { convertToOpenAiMessages } from "../transform/openai-format"
-import { ApiStreamChunk, ApiStreamUsageChunk } from "../transform/stream"
+import { ApiStreamChunk } from "../transform/stream"
 import { convertToR1Format } from "../transform/r1-format"

 import { DEFAULT_HEADERS, DEEP_SEEK_DEFAULT_TEMPERATURE } from "./constants"
@@ -28,6 +28,22 @@ type OpenRouterChatCompletionParams = OpenAI.Chat.ChatCompletionCreateParams & {
     }
 }

+// See `OpenAI.Chat.Completions.ChatCompletionChunk["usage"]`
+// `CompletionsAPI.CompletionUsage`
+// See also: https://openrouter.ai/docs/use-cases/usage-accounting
+interface CompletionUsage {
+    completion_tokens?: number
+    completion_tokens_details?: {
+        reasoning_tokens?: number
+    }
+    prompt_tokens?: number
+    prompt_tokens_details?: {
+        cached_tokens?: number
+    }
+    total_tokens?: number
+    cost?: number
+}
+
 export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler {
     protected options: ApiHandlerOptions
     private client: OpenAI
@@ -46,7 +62,15 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
         systemPrompt: string,
         messages: Anthropic.Messages.MessageParam[],
     ): AsyncGenerator<ApiStreamChunk> {
-        let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort } = this.getModel()
+        let {
+            id: modelId,
+            maxTokens,
+            thinking,
+            temperature,
+            supportsPromptCache,
+            topP,
+            reasoningEffort,
+        } = this.getModel()

         // Convert Anthropic messages to OpenAI format.
         let openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [
@@ -59,46 +83,42 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
             openAiMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages])
         }

-        // prompt caching: https://openrouter.ai/docs/prompt-caching
-        // this is specifically for claude models (some models may 'support prompt caching' automatically without this)
-        switch (true) {
-            case modelId.startsWith("anthropic/"):
-                openAiMessages[0] = {
-                    role: "system",
-                    content: [
-                        {
-                            type: "text",
-                            text: systemPrompt,
-                            // @ts-ignore-next-line
-                            cache_control: { type: "ephemeral" },
-                        },
-                    ],
-                }
+        // Prompt caching: https://openrouter.ai/docs/prompt-caching
+        // Now with Gemini support: https://openrouter.ai/docs/features/prompt-caching
+        if (supportsPromptCache) {
+            openAiMessages[0] = {
+                role: "system",
+                content: [
+                    {
+                        type: "text",
+                        text: systemPrompt,
+                        // @ts-ignore-next-line
+                        cache_control: { type: "ephemeral" },
+                    },
+                ],
+            }

-                // Add cache_control to the last two user messages
-                // (note: this works because we only ever add one user message at a time, but if we added multiple we'd need to mark the user message before the last assistant message)
-                const lastTwoUserMessages = openAiMessages.filter((msg) => msg.role === "user").slice(-2)
+            // Add cache_control to the last two user messages
+            // (note: this works because we only ever add one user message at a time, but if we added multiple we'd need to mark the user message before the last assistant message)
+            const lastTwoUserMessages = openAiMessages.filter((msg) => msg.role === "user").slice(-2)

-                lastTwoUserMessages.forEach((msg) => {
-                    if (typeof msg.content === "string") {
-                        msg.content = [{ type: "text", text: msg.content }]
-                    }
+            lastTwoUserMessages.forEach((msg) => {
+                if (typeof msg.content === "string") {
+                    msg.content = [{ type: "text", text: msg.content }]
+                }

-                    if (Array.isArray(msg.content)) {
-                        // NOTE: this is fine since env details will always be added at the end. but if it weren't there, and the user added a image_url type message, it would pop a text part before it and then move it after to the end.
-                        let lastTextPart = msg.content.filter((part) => part.type === "text").pop()
+                if (Array.isArray(msg.content)) {
+                    // NOTE: this is fine since env details will always be added at the end. but if it weren't there, and the user added a image_url type message, it would pop a text part before it and then move it after to the end.
+                    let lastTextPart = msg.content.filter((part) => part.type === "text").pop()

-                        if (!lastTextPart) {
-                            lastTextPart = { type: "text", text: "..." }
-                            msg.content.push(lastTextPart)
-                        }
-                        // @ts-ignore-next-line
-                        lastTextPart["cache_control"] = { type: "ephemeral" }
+                    if (!lastTextPart) {
+                        lastTextPart = { type: "text", text: "..." }
+                        msg.content.push(lastTextPart)
                     }
-                })
-                break
-            default:
-                break
+                    // @ts-ignore-next-line
+                    lastTextPart["cache_control"] = { type: "ephemeral" }
+                }
+            })
         }

         // https://openrouter.ai/docs/transforms
@@ -125,9 +145,9 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH

         const stream = await this.client.chat.completions.create(completionParams)

-        let lastUsage
+        let lastUsage: CompletionUsage | undefined = undefined

-        for await (const chunk of stream as unknown as AsyncIterable<OpenAI.Chat.Completions.ChatCompletionChunk>) {
+        for await (const chunk of stream) {
             // OpenRouter returns an error object instead of the OpenAI SDK throwing an error.
             if ("error" in chunk) {
                 const error = chunk.error as { message?: string; code?: number }
@@ -137,13 +157,13 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH

             const delta = chunk.choices[0]?.delta

-            if ("reasoning" in delta && delta.reasoning) {
-                yield { type: "reasoning", text: delta.reasoning } as ApiStreamChunk
+            if ("reasoning" in delta && delta.reasoning && typeof delta.reasoning === "string") {
+                yield { type: "reasoning", text: delta.reasoning }
             }

             if (delta?.content) {
                 fullResponseText += delta.content
-                yield { type: "text", text: delta.content } as ApiStreamChunk
+                yield { type: "text", text: delta.content }
             }

             if (chunk.usage) {
@@ -152,16 +172,16 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
         }

         if (lastUsage) {
-            yield this.processUsageMetrics(lastUsage)
-        }
-    }
-
-    processUsageMetrics(usage: any): ApiStreamUsageChunk {
-        return {
-            type: "usage",
-            inputTokens: usage?.prompt_tokens || 0,
-            outputTokens: usage?.completion_tokens || 0,
-            totalCost: usage?.cost || 0,
+            yield {
+                type: "usage",
+                inputTokens: lastUsage.prompt_tokens || 0,
+                outputTokens: lastUsage.completion_tokens || 0,
+                // Waiting on OpenRouter to figure out what this represents in the Gemini case
+                // and how to best support it.
+                // cacheReadTokens: lastUsage.prompt_tokens_details?.cached_tokens,
+                reasoningTokens: lastUsage.completion_tokens_details?.reasoning_tokens,
+                totalCost: lastUsage.cost || 0,
+            }
         }
     }

@@ -171,7 +191,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH

         let id = modelId ?? openRouterDefaultModelId
         const info = modelInfo ?? openRouterDefaultModelInfo
-
+        const supportsPromptCache = modelInfo?.supportsPromptCache
         const isDeepSeekR1 = id.startsWith("deepseek/deepseek-r1") || modelId === "perplexity/sonar-reasoning"
         const defaultTemperature = isDeepSeekR1 ? DEEP_SEEK_DEFAULT_TEMPERATURE : 0
         const topP = isDeepSeekR1 ? 0.95 : undefined
@@ -180,6 +200,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH
             id,
             info,
             ...getModelParams({ options: this.options, model: info, defaultTemperature }),
+            supportsPromptCache,
             topP,
         }
     }
@@ -269,6 +290,11 @@ export async function getOpenRouterModels(options?: ApiHandlerOptions) {
             modelInfo.cacheReadsPrice = 0.03
             modelInfo.maxTokens = 8192
             break
+        case rawModel.id.startsWith("google/gemini-2.5-pro-preview-03-25"):
+        case rawModel.id.startsWith("google/gemini-2.0-flash-001"):
+        case rawModel.id.startsWith("google/gemini-flash-1.5"):
+            modelInfo.supportsPromptCache = true
+            break
         default:
             break
     }
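The cache-breakpoint logic in the hunks above can be read in isolation as roughly the following sketch. It uses simplified message types rather than the OpenAI SDK's ChatCompletionMessageParam, and the markForPromptCaching helper is hypothetical, shown only to illustrate where the cache_control: { type: "ephemeral" } markers end up; in the real handler the same steps run inline in createMessage, gated on the supportsPromptCache flag returned by getModel().

// Minimal sketch with simplified types; markForPromptCaching is illustrative only.
type TextPart = { type: "text"; text: string; cache_control?: { type: "ephemeral" } }
type SimpleMessage = { role: "system" | "user" | "assistant"; content: string | TextPart[] }

function markForPromptCaching(systemPrompt: string, messages: SimpleMessage[]): void {
    // 1. Rewrite the system message as a content array so a cache breakpoint
    //    can be attached directly to the system prompt.
    messages[0] = {
        role: "system",
        content: [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }],
    }

    // 2. Mark the last text part of the last two user messages, mirroring the
    //    fact that the extension appends one user message per turn.
    for (const msg of messages.filter((m) => m.role === "user").slice(-2)) {
        if (typeof msg.content === "string") {
            msg.content = [{ type: "text", text: msg.content }]
        }
        let lastTextPart = msg.content.filter((part) => part.type === "text").pop()
        if (!lastTextPart) {
            lastTextPart = { type: "text", text: "..." }
            msg.content.push(lastTextPart)
        }
        lastTextPart.cache_control = { type: "ephemeral" }
    }
}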

src/api/transform/stream.ts

Lines changed: 3 additions & 1 deletion

@@ -1,4 +1,5 @@
 export type ApiStream = AsyncGenerator<ApiStreamChunk>
+
 export type ApiStreamChunk = ApiStreamTextChunk | ApiStreamUsageChunk | ApiStreamReasoningChunk

 export interface ApiStreamTextChunk {
@@ -17,5 +18,6 @@ export interface ApiStreamUsageChunk {
     outputTokens: number
     cacheWriteTokens?: number
     cacheReadTokens?: number
-    totalCost?: number // openrouter
+    reasoningTokens?: number
+    totalCost?: number
 }
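The new optional reasoningTokens field rides along on ApiStreamUsageChunk and simply stays undefined for providers that do not report it. A hedged sketch of how a consumer might read it (collectFinalUsage is a hypothetical helper and the "./stream" import path assumes a sibling module; only the exported types come from this file):

import type { ApiStream, ApiStreamUsageChunk } from "./stream"

// Drain a stream and keep the last usage chunk, which may now carry
// reasoningTokens alongside the token counts and totalCost.
async function collectFinalUsage(stream: ApiStream): Promise<ApiStreamUsageChunk | undefined> {
    let usage: ApiStreamUsageChunk | undefined
    for await (const chunk of stream) {
        if (chunk.type === "usage") {
            usage = chunk
        }
    }
    return usage
}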

webview-ui/src/components/chat/ChatRow.tsx

Lines changed: 4 additions & 2 deletions

@@ -21,7 +21,7 @@ import { ReasoningBlock } from "./ReasoningBlock"
 import Thumbnails from "../common/Thumbnails"
 import McpResourceRow from "../mcp/McpResourceRow"
 import McpToolRow from "../mcp/McpToolRow"
-import { highlightMentions } from "./TaskHeader"
+import { Mention } from "./Mention"
 import { CheckpointSaved } from "./checkpoints/CheckpointSaved"
 import { FollowUpSuggest } from "./FollowUpSuggest"

@@ -867,7 +867,9 @@ export const ChatRowContent = ({
         return (
             <div className="bg-vscode-editor-background border rounded-xs p-1 overflow-hidden whitespace-pre-wrap word-break-break-word overflow-wrap-anywhere">
                 <div className="flex justify-between gap-2">
-                    <div className="flex-grow px-2 py-1">{highlightMentions(message.text)}</div>
+                    <div className="flex-grow px-2 py-1">
+                        <Mention text={message.text} withShadow />
+                    </div>
                     <Button
                         variant="ghost"
                         size="icon"
ContextWindowProgress.tsx (new file)

Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
+import { useMemo } from "react"
+import { useTranslation } from "react-i18next"
+
+import { formatLargeNumber } from "@/utils/format"
+import { calculateTokenDistribution } from "@/utils/model-utils"
+
+interface ContextWindowProgressProps {
+    contextWindow: number
+    contextTokens: number
+    maxTokens?: number
+}
+
+export const ContextWindowProgress = ({ contextWindow, contextTokens, maxTokens }: ContextWindowProgressProps) => {
+    const { t } = useTranslation()
+    // Use the shared utility function to calculate all token distribution values
+    const tokenDistribution = useMemo(
+        () => calculateTokenDistribution(contextWindow, contextTokens, maxTokens),
+        [contextWindow, contextTokens, maxTokens],
+    )
+
+    // Destructure the values we need
+    const { currentPercent, reservedPercent, availableSize, reservedForOutput, availablePercent } = tokenDistribution
+
+    // For display purposes
+    const safeContextWindow = Math.max(0, contextWindow)
+    const safeContextTokens = Math.max(0, contextTokens)
+
+    return (
+        <>
+            <div className="flex items-center gap-2 flex-1 whitespace-nowrap px-2">
+                <div data-testid="context-tokens-count">{formatLargeNumber(safeContextTokens)}</div>
+                <div className="flex-1 relative">
+                    {/* Invisible overlay for hover area */}
+                    <div
+                        className="absolute w-full h-4 -top-[7px] z-5"
+                        title={t("chat:tokenProgress.availableSpace", { amount: formatLargeNumber(availableSize) })}
+                        data-testid="context-available-space"
+                    />
+
+                    {/* Main progress bar container */}
+                    <div className="flex items-center h-1 rounded-[2px] overflow-hidden w-full bg-[color-mix(in_srgb,var(--vscode-foreground)_20%,transparent)]">
+                        {/* Current tokens container */}
+                        <div className="relative h-full" style={{ width: `${currentPercent}%` }}>
+                            {/* Invisible overlay for current tokens section */}
+                            <div
+                                className="absolute h-4 -top-[7px] w-full z-6"
+                                title={t("chat:tokenProgress.tokensUsed", {
+                                    used: formatLargeNumber(safeContextTokens),
+                                    total: formatLargeNumber(safeContextWindow),
+                                })}
+                                data-testid="context-tokens-used"
+                            />
+                            {/* Current tokens used - darkest */}
+                            <div className="h-full w-full bg-[var(--vscode-foreground)] transition-width duration-300 ease-out" />
+                        </div>
+
+                        {/* Container for reserved tokens */}
+                        <div className="relative h-full" style={{ width: `${reservedPercent}%` }}>
+                            {/* Invisible overlay for reserved section */}
+                            <div
+                                className="absolute h-4 -top-[7px] w-full z-6"
+                                title={t("chat:tokenProgress.reservedForResponse", {
+                                    amount: formatLargeNumber(reservedForOutput),
+                                })}
+                                data-testid="context-reserved-tokens"
+                            />
+                            {/* Reserved for output section - medium gray */}
+                            <div className="h-full w-full bg-[color-mix(in_srgb,var(--vscode-foreground)_30%,transparent)] transition-width duration-300 ease-out" />
+                        </div>
+
+                        {/* Empty section (if any) */}
+                        {availablePercent > 0 && (
+                            <div className="relative h-full" style={{ width: `${availablePercent}%` }}>
+                                {/* Invisible overlay for available space */}
+                                <div
+                                    className="absolute h-4 -top-[7px] w-full z-6"
+                                    title={t("chat:tokenProgress.availableSpace", {
+                                        amount: formatLargeNumber(availableSize),
+                                    })}
+                                    data-testid="context-available-space-section"
+                                />
+                            </div>
+                        )}
+                    </div>
+                </div>
+                <div data-testid="context-window-size">{formatLargeNumber(safeContextWindow)}</div>
+            </div>
+        </>
+    )
+}
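A hypothetical usage sketch (the import path and the numbers are illustrative assumptions, not taken from this commit): the component only needs the model's context window size, the tokens used so far, and optionally the tokens reserved for the response; calculateTokenDistribution turns those into the used / reserved / available percentages rendered above.

import { ContextWindowProgress } from "./ContextWindowProgress"

// Renders a bar split into used, reserved-for-output, and available segments.
export const ContextWindowProgressExample = () => (
    <ContextWindowProgress contextWindow={200_000} contextTokens={48_500} maxTokens={8_192} />
)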
webview-ui/src/components/chat/Mention.tsx (new file)

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+import { mentionRegexGlobal } from "@roo/shared/context-mentions"
+
+import { vscode } from "../../utils/vscode"
+
+interface MentionProps {
+    text?: string
+    withShadow?: boolean
+}
+
+export const Mention = ({ text, withShadow = false }: MentionProps) => {
+    if (!text) {
+        return <>{text}</>
+    }
+
+    const parts = text.split(mentionRegexGlobal).map((part, index) => {
+        if (index % 2 === 0) {
+            // This is regular text.
+            return part
+        } else {
+            // This is a mention.
+            return (
+                <span
+                    key={index}
+                    className={`${withShadow ? "mention-context-highlight-with-shadow" : "mention-context-highlight"} cursor-pointer`}
+                    onClick={() => vscode.postMessage({ type: "openMention", text: part })}>
+                    @{part}
+                </span>
+            )
+        }
+    })
+
+    return <>{parts}</>
+}
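The index % 2 test works because String.prototype.split, given a regex with a capturing group, interleaves the captured mention text with the surrounding plain text. A quick sketch with a simplified stand-in pattern (the real mentionRegexGlobal lives in @roo/shared/context-mentions and differs in what it matches):

// Simplified stand-in for mentionRegexGlobal, for illustration only.
const simplifiedMentionRegex = /@(\S+)/g

const parts = "see @/src/app.ts and @problems".split(simplifiedMentionRegex)
console.log(parts)
// => ["see ", "/src/app.ts", " and ", "problems", ""]
// Even indices are plain text, odd indices are captured mentions, which is why
// the component re-prefixes "@" when rendering a mention span.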
