From 92f39e3f0ab258158825f475717259da2b92f61d Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 24 Apr 2025 13:11:04 -0700 Subject: [PATCH 1/5] Gemini caching improvements --- src/api/providers/gemini.ts | 96 +++++++++++++++++++++--------- src/api/transform/gemini-format.ts | 6 ++ src/shared/api.ts | 6 +- 3 files changed, 78 insertions(+), 30 deletions(-) diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index 43fae541379..803a4be9963 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -4,27 +4,38 @@ import { type GenerateContentResponseUsageMetadata, type GenerateContentParameters, type Content, + CreateCachedContentConfig, } from "@google/genai" +import NodeCache from "node-cache" import { SingleCompletionHandler } from "../" import type { ApiHandlerOptions, GeminiModelId, ModelInfo } from "../../shared/api" import { geminiDefaultModelId, geminiModels } from "../../shared/api" -import { convertAnthropicContentToGemini, convertAnthropicMessageToGemini } from "../transform/gemini-format" +import { + convertAnthropicContentToGemini, + convertAnthropicMessageToGemini, + getMessagesLength, +} from "../transform/gemini-format" import type { ApiStream } from "../transform/stream" import { BaseProvider } from "./base-provider" const CACHE_TTL = 5 +type CacheEntry = { + key: string + count: number +} + export class GeminiHandler extends BaseProvider implements SingleCompletionHandler { protected options: ApiHandlerOptions private client: GoogleGenAI - private contentCaches: Map + private contentCaches: NodeCache constructor(options: ApiHandlerOptions) { super() this.options = options this.client = new GoogleGenAI({ apiKey: options.geminiApiKey ?? "not-provided" }) - this.contentCaches = new Map() + this.contentCaches = new NodeCache({ stdTTL: 5 * 60, checkperiod: 5 * 60 }) } async *createMessage( @@ -35,36 +46,65 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl const { id: model, thinkingConfig, maxOutputTokens, info } = this.getModel() const contents = messages.map(convertAnthropicMessageToGemini) + const contentsLength = systemInstruction.length + getMessagesLength(contents) + let uncachedContent: Content[] | undefined = undefined let cachedContent: string | undefined = undefined let cacheWriteTokens: number | undefined = undefined + const isCacheAvailable = + info.supportsPromptCache && this.options.promptCachingEnabled && cacheKey && contentsLength > 16_384 + + console.log(`[GeminiHandler] isCacheAvailable=${isCacheAvailable}, contentsLength=${contentsLength}`) + // https://ai.google.dev/gemini-api/docs/caching?lang=node - // if (info.supportsPromptCache && cacheKey) { - // const cacheEntry = this.contentCaches.get(cacheKey) + if (isCacheAvailable) { + const cacheEntry = this.contentCaches.get(cacheKey) + + if (cacheEntry) { + uncachedContent = contents.slice(cacheEntry.count, contents.length) + cachedContent = cacheEntry.key + console.log( + `[GeminiHandler] using ${cacheEntry.count} cached messages (${cacheEntry.key}) and ${uncachedContent.length} uncached messages`, + ) + } - // if (cacheEntry) { - // uncachedContent = contents.slice(cacheEntry.count, contents.length) - // cachedContent = cacheEntry.key - // } + const timestamp = Date.now() - // const newCacheEntry = await this.client.caches.create({ - // model, - // config: { contents, systemInstruction, ttl: `${CACHE_TTL * 60}s` }, - // }) + const config: CreateCachedContentConfig = { + contents, + systemInstruction, + ttl: `${CACHE_TTL * 60}s`, + httpOptions: 
{ timeout: 10_000 }, + } + + this.client.caches + .create({ model, config }) + .then((result) => { + console.log(`[GeminiHandler] caches.create result -> ${JSON.stringify(result)}`) + const { name, usageMetadata } = result + + if (name) { + this.contentCaches.set(cacheKey, { key: name, count: contents.length }) + cacheWriteTokens = usageMetadata?.totalTokenCount ?? 0 + console.log( + `[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`, + ) + } + }) + .catch((error) => { + console.error(`[GeminiHandler] caches.create error`, error) + }) + } - // if (newCacheEntry.name) { - // this.contentCaches.set(cacheKey, { key: newCacheEntry.name, count: contents.length }) - // cacheWriteTokens = newCacheEntry.usageMetadata?.totalTokenCount ?? 0 - // } - // } + const isCacheUsed = !!cachedContent const params: GenerateContentParameters = { model, contents: uncachedContent ?? contents, config: { cachedContent, - systemInstruction: cachedContent ? undefined : systemInstruction, + systemInstruction: isCacheUsed ? undefined : systemInstruction, httpOptions: this.options.googleGeminiBaseUrl ? { baseUrl: this.options.googleGeminiBaseUrl } : undefined, @@ -94,13 +134,15 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl const cacheReadTokens = lastUsageMetadata.cachedContentTokenCount const reasoningTokens = lastUsageMetadata.thoughtsTokenCount - // const totalCost = this.calculateCost({ - // info, - // inputTokens, - // outputTokens, - // cacheWriteTokens, - // cacheReadTokens, - // }) + const totalCost = isCacheUsed + ? this.calculateCost({ + info, + inputTokens, + outputTokens, + cacheWriteTokens, + cacheReadTokens, + }) + : undefined yield { type: "usage", @@ -109,7 +151,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl cacheWriteTokens, cacheReadTokens, reasoningTokens, - // totalCost, + totalCost, } } } diff --git a/src/api/transform/gemini-format.ts b/src/api/transform/gemini-format.ts index ee22cff32a4..be08d7ff7ba 100644 --- a/src/api/transform/gemini-format.ts +++ b/src/api/transform/gemini-format.ts @@ -76,3 +76,9 @@ export function convertAnthropicMessageToGemini(message: Anthropic.Messages.Mess parts: convertAnthropicContentToGemini(message.content), } } + +const getContentLength = ({ parts }: Content): number => + parts?.reduce((length, { text }) => length + (text?.length ?? 0), 0) ?? 0 + +export const getMessagesLength = (contents: Content[]): number => + contents.reduce((length, content) => length + getContentLength(content), 0) diff --git a/src/shared/api.ts b/src/shared/api.ts index 25d4a668526..5dbed396a08 100644 --- a/src/shared/api.ts +++ b/src/shared/api.ts @@ -679,7 +679,7 @@ export const geminiModels = { maxTokens: 65_535, contextWindow: 1_048_576, supportsImages: true, - supportsPromptCache: false, + supportsPromptCache: true, isPromptCacheOptional: true, inputPrice: 2.5, // This is the pricing for prompts above 200k tokens. outputPrice: 15, @@ -704,7 +704,7 @@ export const geminiModels = { maxTokens: 8192, contextWindow: 1_048_576, supportsImages: true, - supportsPromptCache: false, + supportsPromptCache: true, isPromptCacheOptional: true, inputPrice: 0.1, outputPrice: 0.4, @@ -755,7 +755,7 @@ export const geminiModels = { maxTokens: 8192, contextWindow: 1_048_576, supportsImages: true, - supportsPromptCache: false, + supportsPromptCache: true, isPromptCacheOptional: true, inputPrice: 0.15, // This is the pricing for prompts above 128k tokens. 
outputPrice: 0.6, From e5ac743a65acfc563647fe7320b7a17a03644cb5 Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 24 Apr 2025 13:54:26 -0700 Subject: [PATCH 2/5] Add changeset --- .changeset/bright-singers-drop.md | 5 +++++ src/api/providers/gemini.ts | 1 + 2 files changed, 6 insertions(+) create mode 100644 .changeset/bright-singers-drop.md diff --git a/.changeset/bright-singers-drop.md b/.changeset/bright-singers-drop.md new file mode 100644 index 00000000000..f76b8001b84 --- /dev/null +++ b/.changeset/bright-singers-drop.md @@ -0,0 +1,5 @@ +--- +"roo-cline": patch +--- + +Enable prompt caching for Gemini (with some improvements) diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index 803a4be9963..878782acbad 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -46,6 +46,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl const { id: model, thinkingConfig, maxOutputTokens, info } = this.getModel() const contents = messages.map(convertAnthropicMessageToGemini) + // This is just an approximation for now; we can use tiktoken eventually. const contentsLength = systemInstruction.length + getMessagesLength(contents) let uncachedContent: Content[] | undefined = undefined From 9570b813c5891e4a818011959d4cabd1a507f891 Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 24 Apr 2025 14:00:07 -0700 Subject: [PATCH 3/5] PR feedback --- src/api/providers/gemini.ts | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index 878782acbad..0887965cf98 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -21,6 +21,8 @@ import { BaseProvider } from "./base-provider" const CACHE_TTL = 5 +const CONTEXT_CACHE_TOKEN_MINIMUM = 4096 + type CacheEntry = { key: string count: number @@ -46,19 +48,25 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl const { id: model, thinkingConfig, maxOutputTokens, info } = this.getModel() const contents = messages.map(convertAnthropicMessageToGemini) - // This is just an approximation for now; we can use tiktoken eventually. const contentsLength = systemInstruction.length + getMessagesLength(contents) let uncachedContent: Content[] | undefined = undefined let cachedContent: string | undefined = undefined let cacheWriteTokens: number | undefined = undefined + // The minimum input token count for context caching is 4,096. + // For a basic appoximation we assume 4 characters per token. + // We can use tiktoken eventually to get a more accurat token count. 
+ // https://ai.google.dev/gemini-api/docs/caching?lang=node + // https://ai.google.dev/gemini-api/docs/tokens?lang=node const isCacheAvailable = - info.supportsPromptCache && this.options.promptCachingEnabled && cacheKey && contentsLength > 16_384 + info.supportsPromptCache && + this.options.promptCachingEnabled && + cacheKey && + contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM console.log(`[GeminiHandler] isCacheAvailable=${isCacheAvailable}, contentsLength=${contentsLength}`) - // https://ai.google.dev/gemini-api/docs/caching?lang=node if (isCacheAvailable) { const cacheEntry = this.contentCaches.get(cacheKey) From dfd7bb3926c7d081e5ede1e33019033cc1e7a9ac Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 24 Apr 2025 14:18:48 -0700 Subject: [PATCH 4/5] Add an isCacheBusy flag --- src/api/providers/gemini.ts | 60 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index 0887965cf98..71b3ae33dc5 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -30,8 +30,10 @@ type CacheEntry = { export class GeminiHandler extends BaseProvider implements SingleCompletionHandler { protected options: ApiHandlerOptions + private client: GoogleGenAI private contentCaches: NodeCache + private isCacheBusy = false constructor(options: ApiHandlerOptions) { super() @@ -65,8 +67,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl cacheKey && contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM - console.log(`[GeminiHandler] isCacheAvailable=${isCacheAvailable}, contentsLength=${contentsLength}`) - if (isCacheAvailable) { const cacheEntry = this.contentCaches.get(cacheKey) @@ -78,32 +78,38 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl ) } - const timestamp = Date.now() - - const config: CreateCachedContentConfig = { - contents, - systemInstruction, - ttl: `${CACHE_TTL * 60}s`, - httpOptions: { timeout: 10_000 }, + if (!this.isCacheBusy) { + this.isCacheBusy = true + const timestamp = Date.now() + + this.client.caches + .create({ + model, + config: { + contents, + systemInstruction, + ttl: `${CACHE_TTL * 60}s`, + httpOptions: { timeout: 120_000 }, + }, + }) + .then((result) => { + const { name, usageMetadata } = result + + if (name) { + this.contentCaches.set(cacheKey, { key: name, count: contents.length }) + cacheWriteTokens = usageMetadata?.totalTokenCount ?? 0 + console.log( + `[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`, + ) + } + }) + .catch((error) => { + console.error(`[GeminiHandler] caches.create error`, error) + }) + .finally(() => { + this.isCacheBusy = false + }) } - - this.client.caches - .create({ model, config }) - .then((result) => { - console.log(`[GeminiHandler] caches.create result -> ${JSON.stringify(result)}`) - const { name, usageMetadata } = result - - if (name) { - this.contentCaches.set(cacheKey, { key: name, count: contents.length }) - cacheWriteTokens = usageMetadata?.totalTokenCount ?? 
0 - console.log( - `[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`, - ) - } - }) - .catch((error) => { - console.error(`[GeminiHandler] caches.create error`, error) - }) } const isCacheUsed = !!cachedContent From c25c2f16e6e8d32d8f2ac3775e9a52062534de1f Mon Sep 17 00:00:00 2001 From: Chris Estreich Date: Thu, 24 Apr 2025 14:19:46 -0700 Subject: [PATCH 5/5] Update src/api/providers/gemini.ts Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> --- src/api/providers/gemini.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index 71b3ae33dc5..ecc493dcd95 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -57,7 +57,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl let cacheWriteTokens: number | undefined = undefined // The minimum input token count for context caching is 4,096. - // For a basic appoximation we assume 4 characters per token. + // For a basic approximation we assume 4 characters per token. // We can use tiktoken eventually to get a more accurat token count. // https://ai.google.dev/gemini-api/docs/caching?lang=node // https://ai.google.dev/gemini-api/docs/tokens?lang=node
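
For reviewers who want to exercise the eligibility logic outside the extension, below is a minimal standalone sketch of the cache checks these patches converge on: the ~4-characters-per-token length estimate, the 4,096-token context-cache minimum, and the NodeCache entry that remembers how many messages a Gemini cache already covers. The module shape and the helper name splitForCache are illustrative assumptions only; in the PR this logic lives inside GeminiHandler.createMessage, and this sketch is not the exact code from the diffs above.

import NodeCache from "node-cache"
import type { Content } from "@google/genai"

const CACHE_TTL = 5 // minutes, matching the patch
const CONTEXT_CACHE_TOKEN_MINIMUM = 4096

type CacheEntry = {
	key: string // cache name returned by caches.create
	count: number // how many messages the cached content covers
}

// Character-count proxy for tokens; the PR notes tiktoken could replace this later.
const getContentLength = ({ parts }: Content): number =>
	parts?.reduce((length, { text }) => length + (text?.length ?? 0), 0) ?? 0

const getMessagesLength = (contents: Content[]): number =>
	contents.reduce((length, content) => length + getContentLength(content), 0)

// Entries expire on the same 5-minute horizon as the Gemini-side cache TTL.
const contentCaches = new NodeCache({ stdTTL: CACHE_TTL * 60, checkperiod: CACHE_TTL * 60 })

// Hypothetical helper (name not from the PR): decide whether an existing Gemini
// cache can be reused and, if so, which trailing messages still need to be sent.
function splitForCache(cacheKey: string, systemInstruction: string, contents: Content[]) {
	const contentsLength = systemInstruction.length + getMessagesLength(contents)

	// Context caching needs at least 4,096 input tokens; at ~4 characters per
	// token that is roughly 16,384 characters.
	if (contentsLength <= 4 * CONTEXT_CACHE_TOKEN_MINIMUM) {
		return { cachedContent: undefined, uncachedContent: contents }
	}

	const entry = contentCaches.get<CacheEntry>(cacheKey)

	return entry
		? { cachedContent: entry.key, uncachedContent: contents.slice(entry.count) }
		: { cachedContent: undefined, uncachedContent: contents }
}

// After a successful caches.create({ model, config }) call, the handler records
// the returned cache name and the message count so the next request can reuse it:
//   contentCaches.set(cacheKey, { key: name, count: contents.length })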