Commit 2359aa4

Gemini prompt caching
1 parent f06567d commit 2359aa4

5 files changed: +77, -33 lines changed

src/api/index.ts

Lines changed: 2 additions & 1 deletion
@@ -28,7 +28,8 @@ export interface SingleCompletionHandler {
 }

 export interface ApiHandler {
-	createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream
+	createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[], taskId?: string): ApiStream
+
 	getModel(): { id: string; info: ModelInfo }

 	/**
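
Because the new taskId parameter is optional, existing providers keep compiling and can simply ignore it; only providers that cache per task need to read it. A minimal sketch of a handler that satisfies the widened signature without caching (the EchoHandler name and the simplified local types are illustrative stand-ins for the project's ApiStream/ModelInfo types, not code from this commit):

// Simplified stand-ins for the project's message, stream, and model-info types.
type MessageParam = { role: "user" | "assistant"; content: string }
type StreamChunk =
	| { type: "text"; text: string }
	| { type: "usage"; inputTokens: number; outputTokens: number }
type ApiStreamLike = AsyncGenerator<StreamChunk>

interface ApiHandlerLike {
	createMessage(systemPrompt: string, messages: MessageParam[], taskId?: string): ApiStreamLike
	getModel(): { id: string; info: { maxTokens: number } }
}

class EchoHandler implements ApiHandlerLike {
	// taskId is accepted but unused; non-caching providers can ignore it entirely.
	async *createMessage(_systemPrompt: string, messages: MessageParam[], _taskId?: string): ApiStreamLike {
		yield { type: "text", text: messages[messages.length - 1]?.content ?? "" }
		yield { type: "usage", inputTokens: 0, outputTokens: 0 }
	}

	getModel() {
		return { id: "echo", info: { maxTokens: 1024 } }
	}
}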

src/api/providers/anthropic.ts

Lines changed: 9 additions & 10 deletions
@@ -42,8 +42,14 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHandler {
 			case "claude-3-opus-20240229":
 			case "claude-3-haiku-20240307": {
 				/**
-				 * The latest message will be the new user message, one before will
-				 * be the assistant message from a previous request, and the user message before that will be a previously cached user message. So we need to mark the latest user message as ephemeral to cache it for the next request, and mark the second to last user message as ephemeral to let the server know the last message to retrieve from the cache for the current request..
+				 * The latest message will be the new user message, one before
+				 * will be the assistant message from a previous request, and
+				 * the user message before that will be a previously cached user
+				 * message. So we need to mark the latest user message as
+				 * ephemeral to cache it for the next request, and mark the
+				 * second to last user message as ephemeral to let the server
+				 * know the last message to retrieve from the cache for the
+				 * current request.
 				 */
 				const userMsgIndices = messages.reduce(
 					(acc, msg, index) => (msg.role === "user" ? [...acc, index] : acc),
@@ -77,9 +83,6 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHandler {
 						}
 						return message
 					}),
-					// tools, // cache breakpoints go from tools > system > messages, and since tools dont change, we can just set the breakpoint at the end of system (this avoids having to set a breakpoint at the end of tools which by itself does not meet min requirements for haiku caching)
-					// tool_choice: { type: "auto" },
-					// tools: tools,
 					stream: true,
 				},
 				(() => {
@@ -102,9 +105,7 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHandler {
 						case "claude-3-opus-20240229":
 						case "claude-3-haiku-20240307":
 							betas.push("prompt-caching-2024-07-31")
-							return {
-								headers: { "anthropic-beta": betas.join(",") },
-							}
+							return { headers: { "anthropic-beta": betas.join(",") } }
 						default:
 							return undefined
 					}
@@ -119,8 +120,6 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHandler {
 					temperature,
 					system: [{ text: systemPrompt, type: "text" }],
 					messages,
-					// tools,
-					// tool_choice: { type: "auto" },
 					stream: true,
 				})) as any
 				break
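
The rewrapped comment above describes the cache-breakpoint strategy: flag the last and second-to-last user messages with an ephemeral cache_control block. A standalone sketch of that marking step (markCacheBreakpoints and the simplified types are illustrative, not code from this commit):

// Mark the last and second-to-last user messages as ephemeral cache breakpoints.
type ContentBlock = { type: "text"; text: string; cache_control?: { type: "ephemeral" } }
type MessageParam = { role: "user" | "assistant"; content: ContentBlock[] }

function markCacheBreakpoints(messages: MessageParam[]): MessageParam[] {
	const userMsgIndices = messages.reduce<number[]>(
		(acc, msg, index) => (msg.role === "user" ? [...acc, index] : acc),
		[],
	)
	const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
	const secondLastUserMsgIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1

	return messages.map((message, index) => {
		if (index !== lastUserMsgIndex && index !== secondLastUserMsgIndex) {
			return message
		}
		// Place the breakpoint on the final content block of the message.
		return {
			...message,
			content: message.content.map((block, i) =>
				i === message.content.length - 1
					? { ...block, cache_control: { type: "ephemeral" } }
					: block,
			),
		}
	})
}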

src/api/providers/gemini.ts

Lines changed: 60 additions & 20 deletions
@@ -4,6 +4,7 @@ import {
 	ThinkingConfig,
 	type GenerateContentResponseUsageMetadata,
 	type GenerateContentParameters,
+	type Content,
 } from "@google/genai"

 import { SingleCompletionHandler } from "../"
@@ -16,27 +17,58 @@ import { BaseProvider } from "./base-provider"
 export class GeminiHandler extends BaseProvider implements SingleCompletionHandler {
 	protected options: ApiHandlerOptions
 	private client: GoogleGenAI
+	private contentCaches: Map<string, string>

 	constructor(options: ApiHandlerOptions) {
 		super()
 		this.options = options
 		this.client = new GoogleGenAI({ apiKey: options.geminiApiKey ?? "not-provided" })
+		this.contentCaches = new Map()
 	}

-	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
-		const { id: model, thinkingConfig, maxOutputTokens } = this.getModel()
+	async *createMessage(
+		systemInstruction: string,
+		messages: Anthropic.Messages.MessageParam[],
+		taskId?: string,
+	): ApiStream {
+		const { id: model, thinkingConfig, maxOutputTokens, supportsPromptCache } = this.getModel()
+
+		const contents = messages.map(convertAnthropicMessageToGemini)
+		let uncachedContent: Content | undefined = undefined
+		let cachedContent: string | undefined = undefined
+		let cacheWriteTokens: number = 0
+
+		// https://ai.google.dev/gemini-api/docs/caching?lang=node
+		if (supportsPromptCache && taskId) {
+			cachedContent = this.contentCaches.get(taskId)
+
+			if (cachedContent) {
+				uncachedContent = convertAnthropicMessageToGemini(messages[messages.length - 1])
+			}
+
+			const updatedCachedContent = await this.client.caches.create({
+				model,
+				config: { contents, systemInstruction, ttl: "300s" },
+			})
+
+			if (updatedCachedContent.name) {
+				this.contentCaches.set(taskId, updatedCachedContent.name)
+				cacheWriteTokens = updatedCachedContent.usageMetadata?.totalTokenCount ?? 0
+			}
+		}

 		const params: GenerateContentParameters = {
 			model,
-			contents: messages.map(convertAnthropicMessageToGemini),
+			contents: uncachedContent ?? contents,
 			config: {
+				cachedContent,
+				systemInstruction: cachedContent ? undefined : systemInstruction,
 				httpOptions: this.options.googleGeminiBaseUrl
 					? { baseUrl: this.options.googleGeminiBaseUrl }
 					: undefined,
 				thinkingConfig,
 				maxOutputTokens,
 				temperature: this.options.modelTemperature ?? 0,
-				systemInstruction: systemPrompt,
 			},
 		}

@@ -55,10 +87,16 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandler {
 		}

 		if (lastUsageMetadata) {
+			const inputTokens = lastUsageMetadata.promptTokenCount ?? 0
+			const cachedInputTokens = lastUsageMetadata.cachedContentTokenCount ?? 0
+			const outputTokens = lastUsageMetadata.candidatesTokenCount ?? 0
+
 			yield {
 				type: "usage",
-				inputTokens: lastUsageMetadata.promptTokenCount ?? 0,
-				outputTokens: lastUsageMetadata.candidatesTokenCount ?? 0,
+				inputTokens: inputTokens - cachedInputTokens,
+				outputTokens,
+				cacheWriteTokens,
+				cacheReadTokens: cachedInputTokens,
 			}
 		}
 	}
@@ -68,33 +106,35 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandler {
 		info: ModelInfo
 		thinkingConfig?: ThinkingConfig
 		maxOutputTokens?: number
+		supportsPromptCache?: boolean
 	} {
 		let id = this.options.apiModelId ? (this.options.apiModelId as GeminiModelId) : geminiDefaultModelId
 		let info: ModelInfo = geminiModels[id]
-		let thinkingConfig: ThinkingConfig | undefined = undefined
-		let maxOutputTokens: number | undefined = undefined

-		const thinkingSuffix = ":thinking"
+		if (id?.endsWith(":thinking")) {
+			id = id.slice(0, -":thinking".length) as GeminiModelId

-		if (id?.endsWith(thinkingSuffix)) {
-			id = id.slice(0, -thinkingSuffix.length) as GeminiModelId
-			info = geminiModels[id]
+			if (geminiModels[id]) {
+				info = geminiModels[id]

-			thinkingConfig = this.options.modelMaxThinkingTokens
-				? { thinkingBudget: this.options.modelMaxThinkingTokens }
-				: undefined
-
-			maxOutputTokens = this.options.modelMaxTokens ?? info.maxTokens ?? undefined
+				return {
+					id,
+					info,
+					thinkingConfig: this.options.modelMaxThinkingTokens
+						? { thinkingBudget: this.options.modelMaxThinkingTokens }
+						: undefined,
+					maxOutputTokens: this.options.modelMaxTokens ?? info.maxTokens ?? undefined,
+					supportsPromptCache: info.supportsPromptCache,
+				}
+			}
 		}

 		if (!info) {
 			id = geminiDefaultModelId
 			info = geminiModels[geminiDefaultModelId]
-			thinkingConfig = undefined
-			maxOutputTokens = undefined
 		}

-		return { id, info, thinkingConfig, maxOutputTokens }
+		return { id, info, supportsPromptCache: info.supportsPromptCache }
 	}

 	async completePrompt(prompt: string): Promise<string> {
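
In short, the handler writes the whole conversation to an explicit Gemini content cache with a 300-second TTL, remembers the cache name per taskId, and on the next turn sends only the newest message alongside cachedContent. A condensed sketch of that flow with the @google/genai client (the cachedTurn helper, the module-level map, and the logging are illustrative; error handling and the project's message conversion are omitted):

import { GoogleGenAI, type Content } from "@google/genai"

// Per-task cache names, keyed the same way the handler's contentCaches map is.
const taskCaches = new Map<string, string>()

async function cachedTurn(
	client: GoogleGenAI,
	model: string,
	taskId: string,
	systemInstruction: string,
	contents: Content[], // full conversation, already converted to Gemini Content
) {
	const cachedContent = taskCaches.get(taskId)

	// With a cache hit, only the newest message goes over the wire.
	const stream = await client.models.generateContentStream({
		model,
		contents: cachedContent ? [contents[contents.length - 1]] : contents,
		config: cachedContent ? { cachedContent } : { systemInstruction },
	})

	for await (const chunk of stream) {
		if (chunk.text) process.stdout.write(chunk.text)
	}

	// Re-cache the latest conversation for the next turn (300s TTL).
	const updated = await client.caches.create({
		model,
		config: { contents, systemInstruction, ttl: "300s" },
	})
	if (updated.name) taskCaches.set(taskId, updated.name)
}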

src/core/Cline.ts

Lines changed: 1 addition & 1 deletion
@@ -1075,7 +1075,7 @@ export class Cline extends EventEmitter<ClineEvents> {
 			return { role, content }
 		})

-		const stream = this.api.createMessage(systemPrompt, cleanConversationHistory)
+		const stream = this.api.createMessage(systemPrompt, cleanConversationHistory, this.taskId)
 		const iterator = stream[Symbol.asyncIterator]()

 		try {

src/shared/api.ts

Lines changed: 5 additions & 1 deletion
@@ -682,9 +682,13 @@ export const geminiModels = {
 		maxTokens: 65_535,
 		contextWindow: 1_048_576,
 		supportsImages: true,
-		supportsPromptCache: false,
+		supportsPromptCache: true,
 		inputPrice: 2.5,
 		outputPrice: 15,
+		// Context caching price:
+		// $0.31, prompts <= 200k tokens
+		// $0.625, prompts > 200k
+		// $4.50 / 1,000,000 tokens per hour
 	},
 	"gemini-2.0-flash-001": {
 		maxTokens: 8192,
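
Reading the new pricing comment as per-million-token rates for prompts up to 200k tokens (with the $4.50 figure as cache storage per token-hour), the usage chunk the Gemini handler now yields can be turned into a rough per-request cost estimate. The estimateCostUsd helper below is an illustrative sketch under those assumptions, not part of the commit:

// Rough cost estimate from the handler's usage chunk; prices taken from the
// comment above (<= 200k-token prompts). The cache write is treated as storage
// and pro-rated for the 300-second TTL.
interface Usage {
	inputTokens: number // uncached prompt tokens
	outputTokens: number
	cacheReadTokens: number // tokens served from the context cache
	cacheWriteTokens: number // tokens written to the cache this request
}

function estimateCostUsd(u: Usage): number {
	const M = 1_000_000
	const input = (u.inputTokens / M) * 2.5
	const output = (u.outputTokens / M) * 15
	const cacheRead = (u.cacheReadTokens / M) * 0.31
	const cacheStorage = (u.cacheWriteTokens / M) * 4.5 * (300 / 3600) // 300s TTL
	return input + output + cacheRead + cacheStorage
}

// Example: 5k fresh prompt tokens, 90k read from cache, 1k output, 95k re-cached.
console.log(
	estimateCostUsd({ inputTokens: 5_000, outputTokens: 1_000, cacheReadTokens: 90_000, cacheWriteTokens: 95_000 }).toFixed(4),
)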
