
Commit fe5c8ff

Gemini OpenRouter caching
1 parent 40bf9e7 commit fe5c8ff


9 files changed: +351 -95 lines changed

src/api/providers/anthropic-vertex.ts

Lines changed: 5 additions & 19 deletions
@@ -3,13 +3,14 @@ import { AnthropicVertex } from "@anthropic-ai/vertex-sdk"
 import { GoogleAuth, JWTInput } from "google-auth-library"
 
 import { ApiHandlerOptions, ModelInfo, vertexDefaultModelId, VertexModelId, vertexModels } from "../../shared/api"
-import { ApiStream } from "../transform/stream"
 import { safeJsonParse } from "../../shared/safeJsonParse"
 
+import { ApiStream } from "../transform/stream"
+import { addCacheBreakpoints } from "../transform/caching/vertex"
+
 import { getModelParams, SingleCompletionHandler } from "../index"
-import { BaseProvider } from "./base-provider"
 import { ANTHROPIC_DEFAULT_MAX_TOKENS } from "./constants"
-import { formatMessageForCache } from "../transform/vertex-caching"
+import { BaseProvider } from "./base-provider"
 
 // https://docs.anthropic.com/en/api/claude-on-vertex-ai
 export class AnthropicVertexHandler extends BaseProvider implements SingleCompletionHandler {

@@ -57,16 +58,6 @@ export class AnthropicVertexHandler extends BaseProvider implements SingleCompletionHandler {
 			thinking,
 		} = this.getModel()
 
-		// Find indices of user messages that we want to cache
-		// We only cache the last two user messages to stay within the 4-block limit
-		// (1 block for system + 1 block each for last two user messages = 3 total)
-		const userMsgIndices = supportsPromptCache
-			? messages.reduce((acc, msg, i) => (msg.role === "user" ? [...acc, i] : acc), [] as number[])
-			: []
-
-		const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
-		const secondLastMsgUserIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1
-
 		/**
 		 * Vertex API has specific limitations for prompt caching:
 		 * 1. Maximum of 4 blocks can have cache_control

@@ -89,12 +80,7 @@ export class AnthropicVertexHandler extends BaseProvider implements SingleCompletionHandler {
 			system: supportsPromptCache
 				? [{ text: systemPrompt, type: "text" as const, cache_control: { type: "ephemeral" } }]
 				: systemPrompt,
-			messages: messages.map((message, index) => {
-				// Only cache the last two user messages.
-				const shouldCache =
-					supportsPromptCache && (index === lastUserMsgIndex || index === secondLastMsgUserIndex)
-				return formatMessageForCache(message, shouldCache)
-			}),
+			messages: supportsPromptCache ? addCacheBreakpoints(messages) : messages,
 			stream: true,
 		}

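Note: the new src/api/transform/caching/vertex.ts that anthropic-vertex.ts now imports is not included in this excerpt. Below is only a minimal sketch of what it presumably contains, based on the inline logic deleted above (put an ephemeral cache_control breakpoint on the last two user messages); the exact signature and block handling are assumptions.

// Hypothetical sketch of src/api/transform/caching/vertex.ts (not in this diff).
// Assumes the same policy as the deleted inline code: only the last two user
// messages get a breakpoint, staying within Vertex's 4-block cache_control limit
// (1 for the system prompt + 2 here).
import { Anthropic } from "@anthropic-ai/sdk"

export function addCacheBreakpoints(
	messages: Anthropic.Messages.MessageParam[],
): Anthropic.Messages.MessageParam[] {
	// Indices of user messages; only the last two are cached.
	const userMsgIndices = messages.reduce(
		(acc, msg, i) => (msg.role === "user" ? [...acc, i] : acc),
		[] as number[],
	)

	const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
	const secondLastUserMsgIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1

	return messages.map((message, index) => {
		const shouldCache = index === lastUserMsgIndex || index === secondLastUserMsgIndex

		if (!shouldCache || message.role !== "user") {
			return message
		}

		// String content becomes a single text block carrying the breakpoint.
		if (typeof message.content === "string") {
			return {
				...message,
				content: [
					{ type: "text" as const, text: message.content, cache_control: { type: "ephemeral" as const } },
				],
			}
		}

		// Array content: attach the breakpoint to the last block.
		const blocks = message.content

		return {
			...message,
			content: blocks.map((block, i) =>
				i === blocks.length - 1 ? { ...block, cache_control: { type: "ephemeral" as const } } : block,
			),
		}
	})
}

Packaging this as a transform leaves the handler with the single supportsPromptCache check shown in the new call site above.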
src/api/providers/glama.ts

Lines changed: 4 additions & 2 deletions
@@ -3,9 +3,11 @@ import axios from "axios"
 import OpenAI from "openai"
 
 import { ApiHandlerOptions, glamaDefaultModelId, glamaDefaultModelInfo } from "../../shared/api"
+
 import { ApiStream } from "../transform/stream"
 import { convertToOpenAiMessages } from "../transform/openai-format"
-import { addCacheControlDirectives } from "../transform/caching"
+import { addCacheBreakpoints } from "../transform/caching/anthropic"
+
 import { SingleCompletionHandler } from "../index"
 import { RouterProvider } from "./router-provider"
 

@@ -37,7 +39,7 @@ export class GlamaHandler extends RouterProvider implements SingleCompletionHandler {
 		]
 
 		if (modelId.startsWith("anthropic/claude-3")) {
-			addCacheControlDirectives(systemPrompt, openAiMessages)
+			addCacheBreakpoints(systemPrompt, openAiMessages)
 		}
 
 		// Required by Anthropic; other providers default to max tokens allowed.

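Note: glama.ts (and unbound.ts below) swap addCacheControlDirectives from the deleted src/api/transform/caching.ts for addCacheBreakpoints in src/api/transform/caching/anthropic.ts, which is also not shown in this excerpt. The sketch below assumes it keeps the behavior previously inlined in openrouter.ts (see the next file): a breakpoint on the system prompt plus the last text part of the last two user messages, mutating the OpenAI-shaped message array in place.

// Hypothetical sketch of src/api/transform/caching/anthropic.ts (not in this diff).
// Assumes it is the renamed addCacheControlDirectives from the deleted
// src/api/transform/caching.ts, i.e. the same placement that openrouter.ts used
// to do inline: system prompt + last two user messages, mutated in place.
import OpenAI from "openai"

export function addCacheBreakpoints(systemPrompt: string, messages: OpenAI.Chat.ChatCompletionMessageParam[]) {
	// Replace the system message with block-form content carrying a breakpoint.
	messages[0] = {
		role: "system",
		// @ts-ignore-next-line
		content: [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }],
	}

	// Mark the last text part of the last two user messages.
	const lastTwoUserMessages = messages.filter((msg) => msg.role === "user").slice(-2)

	lastTwoUserMessages.forEach((msg) => {
		if (typeof msg.content === "string") {
			msg.content = [{ type: "text", text: msg.content }]
		}

		if (Array.isArray(msg.content)) {
			// The last text part is normally the environment details appended at
			// the end; fall back to adding one if the message has no text part.
			let lastTextPart = msg.content.filter((part) => part.type === "text").pop()

			if (!lastTextPart) {
				lastTextPart = { type: "text", text: "..." }
				msg.content.push(lastTextPart)
			}

			// @ts-ignore-next-line
			lastTextPart["cache_control"] = { type: "ephemeral" }
		}
	})
}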
src/api/providers/openrouter.ts

Lines changed: 7 additions & 35 deletions
@@ -11,9 +11,12 @@ import {
 	OPTIONAL_PROMPT_CACHING_MODELS,
 	REASONING_MODELS,
 } from "../../shared/api"
+
 import { convertToOpenAiMessages } from "../transform/openai-format"
 import { ApiStreamChunk } from "../transform/stream"
 import { convertToR1Format } from "../transform/r1-format"
+import { addCacheBreakpoints as addAnthropicCacheBreakpoints } from "../transform/caching/anthropic"
+import { addCacheBreakpoints as addGeminiCacheBreakpoints } from "../transform/caching/gemini"
 
 import { getModelParams, SingleCompletionHandler } from "../index"
 import { DEFAULT_HEADERS, DEEP_SEEK_DEFAULT_TEMPERATURE } from "./constants"

@@ -93,42 +96,11 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler {
 
 		const isCacheAvailable = promptCache.supported && (!promptCache.optional || this.options.promptCachingEnabled)
 
-		// Prompt caching: https://openrouter.ai/docs/prompt-caching
-		// Now with Gemini support: https://openrouter.ai/docs/features/prompt-caching
-		// Note that we don't check the `ModelInfo` object because it is cached
-		// in the settings for OpenRouter and the value could be stale.
+		// https://openrouter.ai/docs/features/prompt-caching
 		if (isCacheAvailable) {
-			openAiMessages[0] = {
-				role: "system",
-				// @ts-ignore-next-line
-				content: [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }],
-			}
-
-			// Add cache_control to the last two user messages
-			// (note: this works because we only ever add one user message at a time, but if we added multiple we'd need to mark the user message before the last assistant message)
-			const lastTwoUserMessages = openAiMessages.filter((msg) => msg.role === "user").slice(-2)
-
-			lastTwoUserMessages.forEach((msg) => {
-				if (typeof msg.content === "string") {
-					msg.content = [{ type: "text", text: msg.content }]
-				}
-
-				if (Array.isArray(msg.content)) {
-					// NOTE: This is fine since env details will always be added
-					// at the end. But if it wasn't there, and the user added a
-					// image_url type message, it would pop a text part before
-					// it and then move it after to the end.
-					let lastTextPart = msg.content.filter((part) => part.type === "text").pop()
-
-					if (!lastTextPart) {
-						lastTextPart = { type: "text", text: "..." }
-						msg.content.push(lastTextPart)
-					}
-
-					// @ts-ignore-next-line
-					lastTextPart["cache_control"] = { type: "ephemeral" }
-				}
-			})
+			modelId.startsWith("google")
+				? addGeminiCacheBreakpoints(systemPrompt, openAiMessages, 10) // Pass frequency
+				: addAnthropicCacheBreakpoints(systemPrompt, openAiMessages)
 		}
 
 		// https://openrouter.ai/docs/transforms

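Note: src/api/transform/caching/gemini.ts is likewise not part of this excerpt. Judging only from the call site above, the third argument (10, annotated "Pass frequency") reads like a breakpoint frequency, so the sketch below marks the system prompt and then every Nth user message with an ephemeral breakpoint; the actual placement rule may differ. The apparent idea is to keep a reasonably recent breakpoint available as the conversation grows instead of only re-marking the tail on every request.

// Hypothetical sketch of src/api/transform/caching/gemini.ts (not in this diff).
// The `frequency` handling is a guess from the call site: mark the system prompt,
// then every `frequency`-th user message, with an ephemeral breakpoint.
import OpenAI from "openai"

export function addCacheBreakpoints(
	systemPrompt: string,
	messages: OpenAI.Chat.ChatCompletionMessageParam[],
	frequency: number,
) {
	// The system prompt always gets a breakpoint, as in the Anthropic variant.
	messages[0] = {
		role: "system",
		// @ts-ignore-next-line
		content: [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }],
	}

	let userMessageCount = 0

	for (const msg of messages) {
		if (msg.role !== "user") {
			continue
		}

		userMessageCount += 1

		// Only every `frequency`-th user message gets a breakpoint.
		if (userMessageCount % frequency !== 0) {
			continue
		}

		if (typeof msg.content === "string") {
			msg.content = [{ type: "text", text: msg.content }]
		}

		if (Array.isArray(msg.content)) {
			const lastTextPart = msg.content.filter((part) => part.type === "text").pop()

			if (lastTextPart) {
				// @ts-ignore-next-line
				lastTextPart["cache_control"] = { type: "ephemeral" }
			}
		}
	}
}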
src/api/providers/unbound.ts

Lines changed: 4 additions & 2 deletions
@@ -2,9 +2,11 @@ import { Anthropic } from "@anthropic-ai/sdk"
 import OpenAI from "openai"
 
 import { ApiHandlerOptions, unboundDefaultModelId, unboundDefaultModelInfo } from "../../shared/api"
+
 import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { convertToOpenAiMessages } from "../transform/openai-format"
-import { addCacheControlDirectives } from "../transform/caching"
+import { addCacheBreakpoints } from "../transform/caching/anthropic"
+
 import { SingleCompletionHandler } from "../index"
 import { RouterProvider } from "./router-provider"
 

@@ -39,7 +41,7 @@ export class UnboundHandler extends RouterProvider implements SingleCompletionHandler {
 		]
 
 		if (modelId.startsWith("anthropic/claude-3")) {
-			addCacheControlDirectives(systemPrompt, openAiMessages)
+			addCacheBreakpoints(systemPrompt, openAiMessages)
 		}
 
 		// Required by Anthropic; other providers default to max tokens allowed.

src/api/transform/caching.ts

Lines changed: 0 additions & 36 deletions
This file was deleted.
