Commit 2359aa4

Gemini prompt caching
1 parent f06567d commit 2359aa4

5 files changed: +77, -33 lines changed

src/api/index.ts

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@ export interface SingleCompletionHandler {
 }

 export interface ApiHandler {
-	createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream
+	createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[], taskId?: string): ApiStream
+
 	getModel(): { id: string; info: ModelInfo }

 	/**
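
Because the new taskId parameter is optional, existing providers keep compiling and can simply ignore it; only providers that cache per task need to read it. A minimal sketch of a handler that satisfies the widened signature without caching (the EchoHandler name and the simplified local types are illustrative stand-ins for the project's ApiStream/ModelInfo types, not code from this commit):

// Simplified stand-ins for the project's message, stream, and model-info types.
type MessageParam = { role: "user" | "assistant"; content: string }
type StreamChunk =
	| { type: "text"; text: string }
	| { type: "usage"; inputTokens: number; outputTokens: number }
type ApiStreamLike = AsyncGenerator<StreamChunk>

interface ApiHandlerLike {
	createMessage(systemPrompt: string, messages: MessageParam[], taskId?: string): ApiStreamLike
	getModel(): { id: string; info: { maxTokens: number } }
}

class EchoHandler implements ApiHandlerLike {
	// taskId is accepted but unused; non-caching providers can ignore it entirely.
	async *createMessage(_systemPrompt: string, messages: MessageParam[], _taskId?: string): ApiStreamLike {
		yield { type: "text", text: messages[messages.length - 1]?.content ?? "" }
		yield { type: "usage", inputTokens: 0, outputTokens: 0 }
	}

	getModel() {
		return { id: "echo", info: { maxTokens: 1024 } }
	}
}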

src/api/providers/anthropic.ts

Lines changed: 9 additions & 10 deletions
@@ -42,8 +42,14 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHandler {
 			case "claude-3-opus-20240229":
 			case "claude-3-haiku-20240307": {
 				/**
-				 * The latest message will be the new user message, one before will
-				 * be the assistant message from a previous request, and the user message before that will be a previously cached user message. So we need to mark the latest user message as ephemeral to cache it for the next request, and mark the second to last user message as ephemeral to let the server know the last message to retrieve from the cache for the current request..
+				 * The latest message will be the new user message, one before
+				 * will be the assistant message from a previous request, and
+				 * the user message before that will be a previously cached user
+				 * message. So we need to mark the latest user message as
+				 * ephemeral to cache it for the next request, and mark the
+				 * second to last user message as ephemeral to let the server
+				 * know the last message to retrieve from the cache for the
+				 * current request.
 				 */
 				const userMsgIndices = messages.reduce(
 					(acc, msg, index) => (msg.role === "user" ? [...acc, index] : acc),
@@ -77,9 +83,6 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHandler {
 						}
 						return message
 					}),
-					// tools, // cache breakpoints go from tools > system > messages, and since tools dont change, we can just set the breakpoint at the end of system (this avoids having to set a breakpoint at the end of tools which by itself does not meet min requirements for haiku caching)
-					// tool_choice: { type: "auto" },
-					// tools: tools,
 					stream: true,
 				},
 				(() => {
@@ -102,9 +105,7 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHandler {
 						case "claude-3-opus-20240229":
 						case "claude-3-haiku-20240307":
 							betas.push("prompt-caching-2024-07-31")
-							return {
-								headers: { "anthropic-beta": betas.join(",") },
-							}
+							return { headers: { "anthropic-beta": betas.join(",") } }
 						default:
 							return undefined
 					}
@@ -119,8 +120,6 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHandler {
 					temperature,
 					system: [{ text: systemPrompt, type: "text" }],
 					messages,
-					// tools,
-					// tool_choice: { type: "auto" },
 					stream: true,
 				})) as any
 				break
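
The rewrapped comment above describes the cache-breakpoint strategy: flag the last and second-to-last user messages with an ephemeral cache_control block. A standalone sketch of that marking step (markCacheBreakpoints and the simplified types are illustrative, not code from this commit):

// Mark the last and second-to-last user messages as ephemeral cache breakpoints.
type ContentBlock = { type: "text"; text: string; cache_control?: { type: "ephemeral" } }
type MessageParam = { role: "user" | "assistant"; content: ContentBlock[] }

function markCacheBreakpoints(messages: MessageParam[]): MessageParam[] {
	const userMsgIndices = messages.reduce<number[]>(
		(acc, msg, index) => (msg.role === "user" ? [...acc, index] : acc),
		[],
	)
	const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
	const secondLastUserMsgIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1

	return messages.map((message, index) => {
		if (index !== lastUserMsgIndex && index !== secondLastUserMsgIndex) {
			return message
		}
		// Place the breakpoint on the final content block of the message.
		return {
			...message,
			content: message.content.map((block, i) =>
				i === message.content.length - 1
					? { ...block, cache_control: { type: "ephemeral" } }
					: block,
			),
		}
	})
}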

src/api/providers/gemini.ts

Lines changed: 60 additions & 20 deletions
@@ -4,6 +4,7 @@ import {
 	ThinkingConfig,
 	type GenerateContentResponseUsageMetadata,
 	type GenerateContentParameters,
+	type Content,
 } from "@google/genai"

 import { SingleCompletionHandler } from "../"
@@ -16,27 +17,58 @@ import { BaseProvider } from "./base-provider"
 export class GeminiHandler extends BaseProvider implements SingleCompletionHandler {
 	protected options: ApiHandlerOptions
 	private client: GoogleGenAI
+	private contentCaches: Map<string, string>

 	constructor(options: ApiHandlerOptions) {
 		super()
 		this.options = options
 		this.client = new GoogleGenAI({ apiKey: options.geminiApiKey ?? "not-provided" })
+		this.contentCaches = new Map()
 	}

-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
-		const { id: model, thinkingConfig, maxOutputTokens } = this.getModel()
+	async *createMessage(
+		systemInstruction: string,
+		messages: Anthropic.Messages.MessageParam[],
+		taskId?: string,
+	): ApiStream {
+		const { id: model, thinkingConfig, maxOutputTokens, supportsPromptCache } = this.getModel()
+
+		const contents = messages.map(convertAnthropicMessageToGemini)
+		let uncachedContent: Content | undefined = undefined
+		let cachedContent: string | undefined = undefined
+		let cacheWriteTokens: number = 0
+
+		// https://ai.google.dev/gemini-api/docs/caching?lang=node
+		if (supportsPromptCache && taskId) {
+			cachedContent = this.contentCaches.get(taskId)
+
+			if (cachedContent) {
+				uncachedContent = convertAnthropicMessageToGemini(messages[messages.length - 1])
+			}
+
+			const updatedCachedContent = await this.client.caches.create({
+				model,
+				config: { contents, systemInstruction, ttl: "300s" },
+			})
+
+			if (updatedCachedContent.name) {
+				this.contentCaches.set(taskId, updatedCachedContent.name)
+				cacheWriteTokens = updatedCachedContent.usageMetadata?.totalTokenCount ?? 0
+			}
+		}

 		const params: GenerateContentParameters = {
 			model,
-			contents: messages.map(convertAnthropicMessageToGemini),
+			contents: uncachedContent ?? contents,
 			config: {
+				cachedContent,
+				systemInstruction: cachedContent ? undefined : systemInstruction,
 				httpOptions: this.options.googleGeminiBaseUrl
 					? { baseUrl: this.options.googleGeminiBaseUrl }
 					: undefined,
 				thinkingConfig,
 				maxOutputTokens,
 				temperature: this.options.modelTemperature ?? 0,
-				systemInstruction: systemPrompt,
 			},
 		}

@@ -55,10 +87,16 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandler {
 		}

 		if (lastUsageMetadata) {
+			const inputTokens = lastUsageMetadata.promptTokenCount ?? 0
+			const cachedInputTokens = lastUsageMetadata.cachedContentTokenCount ?? 0
+			const outputTokens = lastUsageMetadata.candidatesTokenCount ?? 0
+
 			yield {
 				type: "usage",
-				inputTokens: lastUsageMetadata.promptTokenCount ?? 0,
-				outputTokens: lastUsageMetadata.candidatesTokenCount ?? 0,
+				inputTokens: inputTokens - cachedInputTokens,
+				outputTokens,
+				cacheWriteTokens,
+				cacheReadTokens: cachedInputTokens,
 			}
 		}
 	}
@@ -68,33 +106,35 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandler {
 		info: ModelInfo
 		thinkingConfig?: ThinkingConfig
 		maxOutputTokens?: number
+		supportsPromptCache?: boolean
 	} {
 		let id = this.options.apiModelId ? (this.options.apiModelId as GeminiModelId) : geminiDefaultModelId
 		let info: ModelInfo = geminiModels[id]
-		let thinkingConfig: ThinkingConfig | undefined = undefined
-		let maxOutputTokens: number | undefined = undefined

-		const thinkingSuffix = ":thinking"
+		if (id?.endsWith(":thinking")) {
+			id = id.slice(0, -":thinking".length) as GeminiModelId

-		if (id?.endsWith(thinkingSuffix)) {
-			id = id.slice(0, -thinkingSuffix.length) as GeminiModelId
-			info = geminiModels[id]
+			if (geminiModels[id]) {
+				info = geminiModels[id]

-			thinkingConfig = this.options.modelMaxThinkingTokens
-				? { thinkingBudget: this.options.modelMaxThinkingTokens }
-				: undefined
-
-			maxOutputTokens = this.options.modelMaxTokens ?? info.maxTokens ?? undefined
+				return {
+					id,
+					info,
+					thinkingConfig: this.options.modelMaxThinkingTokens
+						? { thinkingBudget: this.options.modelMaxThinkingTokens }
+						: undefined,
+					maxOutputTokens: this.options.modelMaxTokens ?? info.maxTokens ?? undefined,
+					supportsPromptCache: info.supportsPromptCache,
+				}
+			}
 		}

 		if (!info) {
 			id = geminiDefaultModelId
 			info = geminiModels[geminiDefaultModelId]
-			thinkingConfig = undefined
-			maxOutputTokens = undefined
 		}

-		return { id, info, thinkingConfig, maxOutputTokens }
+		return { id, info, supportsPromptCache: info.supportsPromptCache }
 	}

 	async completePrompt(prompt: string): Promise<string> {
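
In short, the handler writes the whole conversation to an explicit Gemini content cache with a 300-second TTL, remembers the cache name per taskId, and on the next turn sends only the newest message alongside cachedContent. A condensed sketch of that flow with the @google/genai client (the cachedTurn helper, the module-level map, and the logging are illustrative; error handling and the project's message conversion are omitted):

import { GoogleGenAI, type Content } from "@google/genai"

// Per-task cache names, keyed the same way the handler's contentCaches map is.
const taskCaches = new Map<string, string>()

async function cachedTurn(
	client: GoogleGenAI,
	model: string,
	taskId: string,
	systemInstruction: string,
	contents: Content[], // full conversation, already converted to Gemini Content
) {
	const cachedContent = taskCaches.get(taskId)

	// With a cache hit, only the newest message goes over the wire.
	const stream = await client.models.generateContentStream({
		model,
		contents: cachedContent ? [contents[contents.length - 1]] : contents,
		config: cachedContent ? { cachedContent } : { systemInstruction },
	})

	for await (const chunk of stream) {
		if (chunk.text) process.stdout.write(chunk.text)
	}

	// Re-cache the latest conversation for the next turn (300s TTL).
	const updated = await client.caches.create({
		model,
		config: { contents, systemInstruction, ttl: "300s" },
	})
	if (updated.name) taskCaches.set(taskId, updated.name)
}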

src/core/Cline.ts

Lines changed: 1 addition & 1 deletion
@@ -1075,7 +1075,7 @@ export class Cline extends EventEmitter<ClineEvents> {
 			return { role, content }
 		})

-		const stream = this.api.createMessage(systemPrompt, cleanConversationHistory)
+		const stream = this.api.createMessage(systemPrompt, cleanConversationHistory, this.taskId)
 		const iterator = stream[Symbol.asyncIterator]()

 		try {

src/shared/api.ts

Lines changed: 5 additions & 1 deletion
@@ -682,9 +682,13 @@ export const geminiModels = {
 		maxTokens: 65_535,
 		contextWindow: 1_048_576,
 		supportsImages: true,
-		supportsPromptCache: false,
+		supportsPromptCache: true,
 		inputPrice: 2.5,
 		outputPrice: 15,
+		// Context caching price:
+		// $0.31, prompts <= 200k tokens
+		// $0.625, prompts > 200k
+		// $4.50 / 1,000,000 tokens per hour
 	},
 	"gemini-2.0-flash-001": {
 		maxTokens: 8192,
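
Reading the new pricing comment as per-million-token rates for prompts up to 200k tokens (with the $4.50 figure as cache storage per token-hour), the usage chunk the Gemini handler now yields can be turned into a rough per-request cost estimate. The estimateCostUsd helper below is an illustrative sketch under those assumptions, not part of the commit:

// Rough cost estimate from the handler's usage chunk; prices taken from the
// comment above (<= 200k-token prompts). The cache write is treated as storage
// and pro-rated for the 300-second TTL.
interface Usage {
	inputTokens: number // uncached prompt tokens
	outputTokens: number
	cacheReadTokens: number // tokens served from the context cache
	cacheWriteTokens: number // tokens written to the cache this request
}

function estimateCostUsd(u: Usage): number {
	const M = 1_000_000
	const input = (u.inputTokens / M) * 2.5
	const output = (u.outputTokens / M) * 15
	const cacheRead = (u.cacheReadTokens / M) * 0.31
	const cacheStorage = (u.cacheWriteTokens / M) * 4.5 * (300 / 3600) // 300s TTL
	return input + output + cacheRead + cacheStorage
}

// Example: 5k fresh prompt tokens, 90k read from cache, 1k output, 95k re-cached.
console.log(
	estimateCostUsd({ inputTokens: 5_000, outputTokens: 1_000, cacheReadTokens: 90_000, cacheWriteTokens: 95_000 }).toFixed(4),
)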
