From 92f39e3f0ab258158825f475717259da2b92f61d Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 24 Apr 2025 13:11:04 -0700 Subject: [PATCH 1/5] Gemini caching improvements --- src/api/providers/gemini.ts | 96 +++++++++++++++++++++--------- src/api/transform/gemini-format.ts | 6 ++ src/shared/api.ts | 6 +- 3 files changed, 78 insertions(+), 30 deletions(-) diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index 43fae541379..803a4be9963 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -4,27 +4,38 @@ import { type GenerateContentResponseUsageMetadata, type GenerateContentParameters, type Content, + CreateCachedContentConfig, } from "@google/genai" +import NodeCache from "node-cache" import { SingleCompletionHandler } from "../" import type { ApiHandlerOptions, GeminiModelId, ModelInfo } from "../../shared/api" import { geminiDefaultModelId, geminiModels } from "../../shared/api" -import { convertAnthropicContentToGemini, convertAnthropicMessageToGemini } from "../transform/gemini-format" +import { + convertAnthropicContentToGemini, + convertAnthropicMessageToGemini, + getMessagesLength, +} from "../transform/gemini-format" import type { ApiStream } from "../transform/stream" import { BaseProvider } from "./base-provider" const CACHE_TTL = 5 +type CacheEntry = { + key: string + count: number +} + export class GeminiHandler extends BaseProvider implements SingleCompletionHandler { protected options: ApiHandlerOptions private client: GoogleGenAI - private contentCaches: Map + private contentCaches: NodeCache constructor(options: ApiHandlerOptions) { super() this.options = options this.client = new GoogleGenAI({ apiKey: options.geminiApiKey ?? "not-provided" }) - this.contentCaches = new Map() + this.contentCaches = new NodeCache({ stdTTL: 5 * 60, checkperiod: 5 * 60 }) } async *createMessage( @@ -35,36 +46,65 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl const { id: model, thinkingConfig, maxOutputTokens, info } = this.getModel() const contents = messages.map(convertAnthropicMessageToGemini) + const contentsLength = systemInstruction.length + getMessagesLength(contents) + let uncachedContent: Content[] | undefined = undefined let cachedContent: string | undefined = undefined let cacheWriteTokens: number | undefined = undefined + const isCacheAvailable = + info.supportsPromptCache && this.options.promptCachingEnabled && cacheKey && contentsLength > 16_384 + + console.log(`[GeminiHandler] isCacheAvailable=${isCacheAvailable}, contentsLength=${contentsLength}`) + // https://ai.google.dev/gemini-api/docs/caching?lang=node - // if (info.supportsPromptCache && cacheKey) { - // const cacheEntry = this.contentCaches.get(cacheKey) + if (isCacheAvailable) { + const cacheEntry = this.contentCaches.get(cacheKey) + + if (cacheEntry) { + uncachedContent = contents.slice(cacheEntry.count, contents.length) + cachedContent = cacheEntry.key + console.log( + `[GeminiHandler] using ${cacheEntry.count} cached messages (${cacheEntry.key}) and ${uncachedContent.length} uncached messages`, + ) + } - // if (cacheEntry) { - // uncachedContent = contents.slice(cacheEntry.count, contents.length) - // cachedContent = cacheEntry.key - // } + const timestamp = Date.now() - // const newCacheEntry = await this.client.caches.create({ - // model, - // config: { contents, systemInstruction, ttl: `${CACHE_TTL * 60}s` }, - // }) + const config: CreateCachedContentConfig = { + contents, + systemInstruction, + ttl: `${CACHE_TTL * 60}s`, + httpOptions: 
{ timeout: 10_000 }, + } + + this.client.caches + .create({ model, config }) + .then((result) => { + console.log(`[GeminiHandler] caches.create result -> ${JSON.stringify(result)}`) + const { name, usageMetadata } = result + + if (name) { + this.contentCaches.set(cacheKey, { key: name, count: contents.length }) + cacheWriteTokens = usageMetadata?.totalTokenCount ?? 0 + console.log( + `[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`, + ) + } + }) + .catch((error) => { + console.error(`[GeminiHandler] caches.create error`, error) + }) + } - // if (newCacheEntry.name) { - // this.contentCaches.set(cacheKey, { key: newCacheEntry.name, count: contents.length }) - // cacheWriteTokens = newCacheEntry.usageMetadata?.totalTokenCount ?? 0 - // } - // } + const isCacheUsed = !!cachedContent const params: GenerateContentParameters = { model, contents: uncachedContent ?? contents, config: { cachedContent, - systemInstruction: cachedContent ? undefined : systemInstruction, + systemInstruction: isCacheUsed ? undefined : systemInstruction, httpOptions: this.options.googleGeminiBaseUrl ? { baseUrl: this.options.googleGeminiBaseUrl } : undefined, @@ -94,13 +134,15 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl const cacheReadTokens = lastUsageMetadata.cachedContentTokenCount const reasoningTokens = lastUsageMetadata.thoughtsTokenCount - // const totalCost = this.calculateCost({ - // info, - // inputTokens, - // outputTokens, - // cacheWriteTokens, - // cacheReadTokens, - // }) + const totalCost = isCacheUsed + ? this.calculateCost({ + info, + inputTokens, + outputTokens, + cacheWriteTokens, + cacheReadTokens, + }) + : undefined yield { type: "usage", @@ -109,7 +151,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl cacheWriteTokens, cacheReadTokens, reasoningTokens, - // totalCost, + totalCost, } } } diff --git a/src/api/transform/gemini-format.ts b/src/api/transform/gemini-format.ts index ee22cff32a4..be08d7ff7ba 100644 --- a/src/api/transform/gemini-format.ts +++ b/src/api/transform/gemini-format.ts @@ -76,3 +76,9 @@ export function convertAnthropicMessageToGemini(message: Anthropic.Messages.Mess parts: convertAnthropicContentToGemini(message.content), } } + +const getContentLength = ({ parts }: Content): number => + parts?.reduce((length, { text }) => length + (text?.length ?? 0), 0) ?? 0 + +export const getMessagesLength = (contents: Content[]): number => + contents.reduce((length, content) => length + getContentLength(content), 0) diff --git a/src/shared/api.ts b/src/shared/api.ts index 25d4a668526..5dbed396a08 100644 --- a/src/shared/api.ts +++ b/src/shared/api.ts @@ -679,7 +679,7 @@ export const geminiModels = { maxTokens: 65_535, contextWindow: 1_048_576, supportsImages: true, - supportsPromptCache: false, + supportsPromptCache: true, isPromptCacheOptional: true, inputPrice: 2.5, // This is the pricing for prompts above 200k tokens. outputPrice: 15, @@ -704,7 +704,7 @@ export const geminiModels = { maxTokens: 8192, contextWindow: 1_048_576, supportsImages: true, - supportsPromptCache: false, + supportsPromptCache: true, isPromptCacheOptional: true, inputPrice: 0.1, outputPrice: 0.4, @@ -755,7 +755,7 @@ export const geminiModels = { maxTokens: 8192, contextWindow: 1_048_576, supportsImages: true, - supportsPromptCache: false, + supportsPromptCache: true, isPromptCacheOptional: true, inputPrice: 0.15, // This is the pricing for prompts above 128k tokens. 
outputPrice: 0.6, From e5ac743a65acfc563647fe7320b7a17a03644cb5 Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 24 Apr 2025 13:54:26 -0700 Subject: [PATCH 2/5] Add changeset --- .changeset/bright-singers-drop.md | 5 +++++ src/api/providers/gemini.ts | 1 + 2 files changed, 6 insertions(+) create mode 100644 .changeset/bright-singers-drop.md diff --git a/.changeset/bright-singers-drop.md b/.changeset/bright-singers-drop.md new file mode 100644 index 00000000000..f76b8001b84 --- /dev/null +++ b/.changeset/bright-singers-drop.md @@ -0,0 +1,5 @@ +--- +"roo-cline": patch +--- + +Enable prompt caching for Gemini (with some improvements) diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index 803a4be9963..878782acbad 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -46,6 +46,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl const { id: model, thinkingConfig, maxOutputTokens, info } = this.getModel() const contents = messages.map(convertAnthropicMessageToGemini) + // This is just an approximation for now; we can use tiktoken eventually. const contentsLength = systemInstruction.length + getMessagesLength(contents) let uncachedContent: Content[] | undefined = undefined From 9570b813c5891e4a818011959d4cabd1a507f891 Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 24 Apr 2025 14:00:07 -0700 Subject: [PATCH 3/5] PR feedback --- src/api/providers/gemini.ts | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index 878782acbad..0887965cf98 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -21,6 +21,8 @@ import { BaseProvider } from "./base-provider" const CACHE_TTL = 5 +const CONTEXT_CACHE_TOKEN_MINIMUM = 4096 + type CacheEntry = { key: string count: number @@ -46,19 +48,25 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl const { id: model, thinkingConfig, maxOutputTokens, info } = this.getModel() const contents = messages.map(convertAnthropicMessageToGemini) - // This is just an approximation for now; we can use tiktoken eventually. const contentsLength = systemInstruction.length + getMessagesLength(contents) let uncachedContent: Content[] | undefined = undefined let cachedContent: string | undefined = undefined let cacheWriteTokens: number | undefined = undefined + // The minimum input token count for context caching is 4,096. + // For a basic appoximation we assume 4 characters per token. + // We can use tiktoken eventually to get a more accurat token count. 
+ // https://ai.google.dev/gemini-api/docs/caching?lang=node + // https://ai.google.dev/gemini-api/docs/tokens?lang=node const isCacheAvailable = - info.supportsPromptCache && this.options.promptCachingEnabled && cacheKey && contentsLength > 16_384 + info.supportsPromptCache && + this.options.promptCachingEnabled && + cacheKey && + contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM console.log(`[GeminiHandler] isCacheAvailable=${isCacheAvailable}, contentsLength=${contentsLength}`) - // https://ai.google.dev/gemini-api/docs/caching?lang=node if (isCacheAvailable) { const cacheEntry = this.contentCaches.get(cacheKey) From dfd7bb3926c7d081e5ede1e33019033cc1e7a9ac Mon Sep 17 00:00:00 2001 From: cte Date: Thu, 24 Apr 2025 14:18:48 -0700 Subject: [PATCH 4/5] Add an isCacheBusy flag --- src/api/providers/gemini.ts | 60 ++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index 0887965cf98..71b3ae33dc5 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -30,8 +30,10 @@ type CacheEntry = { export class GeminiHandler extends BaseProvider implements SingleCompletionHandler { protected options: ApiHandlerOptions + private client: GoogleGenAI private contentCaches: NodeCache + private isCacheBusy = false constructor(options: ApiHandlerOptions) { super() @@ -65,8 +67,6 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl cacheKey && contentsLength > 4 * CONTEXT_CACHE_TOKEN_MINIMUM - console.log(`[GeminiHandler] isCacheAvailable=${isCacheAvailable}, contentsLength=${contentsLength}`) - if (isCacheAvailable) { const cacheEntry = this.contentCaches.get(cacheKey) @@ -78,32 +78,38 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl ) } - const timestamp = Date.now() - - const config: CreateCachedContentConfig = { - contents, - systemInstruction, - ttl: `${CACHE_TTL * 60}s`, - httpOptions: { timeout: 10_000 }, + if (!this.isCacheBusy) { + this.isCacheBusy = true + const timestamp = Date.now() + + this.client.caches + .create({ + model, + config: { + contents, + systemInstruction, + ttl: `${CACHE_TTL * 60}s`, + httpOptions: { timeout: 120_000 }, + }, + }) + .then((result) => { + const { name, usageMetadata } = result + + if (name) { + this.contentCaches.set(cacheKey, { key: name, count: contents.length }) + cacheWriteTokens = usageMetadata?.totalTokenCount ?? 0 + console.log( + `[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`, + ) + } + }) + .catch((error) => { + console.error(`[GeminiHandler] caches.create error`, error) + }) + .finally(() => { + this.isCacheBusy = false + }) } - - this.client.caches - .create({ model, config }) - .then((result) => { - console.log(`[GeminiHandler] caches.create result -> ${JSON.stringify(result)}`) - const { name, usageMetadata } = result - - if (name) { - this.contentCaches.set(cacheKey, { key: name, count: contents.length }) - cacheWriteTokens = usageMetadata?.totalTokenCount ?? 
0 - console.log( - `[GeminiHandler] cached ${contents.length} messages (${cacheWriteTokens} tokens) in ${Date.now() - timestamp}ms`, - ) - } - }) - .catch((error) => { - console.error(`[GeminiHandler] caches.create error`, error) - }) } const isCacheUsed = !!cachedContent From c25c2f16e6e8d32d8f2ac3775e9a52062534de1f Mon Sep 17 00:00:00 2001 From: Chris Estreich Date: Thu, 24 Apr 2025 14:19:46 -0700 Subject: [PATCH 5/5] Update src/api/providers/gemini.ts Co-authored-by: ellipsis-dev[bot] <65095814+ellipsis-dev[bot]@users.noreply.github.com> --- src/api/providers/gemini.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/api/providers/gemini.ts b/src/api/providers/gemini.ts index 71b3ae33dc5..ecc493dcd95 100644 --- a/src/api/providers/gemini.ts +++ b/src/api/providers/gemini.ts @@ -57,7 +57,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl let cacheWriteTokens: number | undefined = undefined // The minimum input token count for context caching is 4,096. - // For a basic appoximation we assume 4 characters per token. + // For a basic approximation we assume 4 characters per token. // We can use tiktoken eventually to get a more accurat token count. // https://ai.google.dev/gemini-api/docs/caching?lang=node // https://ai.google.dev/gemini-api/docs/tokens?lang=node
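
For reviewers who want to exercise the eligibility logic outside the extension, below is a minimal standalone sketch of the cache checks these patches converge on: the ~4-characters-per-token length estimate, the 4,096-token context-cache minimum, and the NodeCache entry that remembers how many messages a Gemini cache already covers. The module shape and the helper name splitForCache are illustrative assumptions only; in the PR this logic lives inside GeminiHandler.createMessage, and this sketch is not the exact code from the diffs above.

import NodeCache from "node-cache"
import type { Content } from "@google/genai"

const CACHE_TTL = 5 // minutes, matching the patch
const CONTEXT_CACHE_TOKEN_MINIMUM = 4096

type CacheEntry = {
	key: string // cache name returned by caches.create
	count: number // how many messages the cached content covers
}

// Character-count proxy for tokens; the PR notes tiktoken could replace this later.
const getContentLength = ({ parts }: Content): number =>
	parts?.reduce((length, { text }) => length + (text?.length ?? 0), 0) ?? 0

const getMessagesLength = (contents: Content[]): number =>
	contents.reduce((length, content) => length + getContentLength(content), 0)

// Entries expire on the same 5-minute horizon as the Gemini-side cache TTL.
const contentCaches = new NodeCache({ stdTTL: CACHE_TTL * 60, checkperiod: CACHE_TTL * 60 })

// Hypothetical helper (name not from the PR): decide whether an existing Gemini
// cache can be reused and, if so, which trailing messages still need to be sent.
function splitForCache(cacheKey: string, systemInstruction: string, contents: Content[]) {
	const contentsLength = systemInstruction.length + getMessagesLength(contents)

	// Context caching needs at least 4,096 input tokens; at ~4 characters per
	// token that is roughly 16,384 characters.
	if (contentsLength <= 4 * CONTEXT_CACHE_TOKEN_MINIMUM) {
		return { cachedContent: undefined, uncachedContent: contents }
	}

	const entry = contentCaches.get<CacheEntry>(cacheKey)

	return entry
		? { cachedContent: entry.key, uncachedContent: contents.slice(entry.count) }
		: { cachedContent: undefined, uncachedContent: contents }
}

// After a successful caches.create({ model, config }) call, the handler records
// the returned cache name and the message count so the next request can reuse it:
//   contentCaches.set(cacheKey, { key: name, count: contents.length })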