
Commit 801946f

feat: allow enabling prompt caching for LiteLLM + Claude (RooCodeInc#2627)
* feat: allow enabling prompt caching for LiteLLM + Claude
1 parent 13b6941 commit 801946f

6 files changed: +87 −5 lines changed
Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
+---
+"claude-dev": patch
+---
+
+allow enabling prompt caching for LiteLLM + Claude

src/api/providers/litellm.ts

Lines changed: 48 additions & 3 deletions

@@ -71,9 +71,37 @@ export class LiteLlmHandler implements ApiHandler {
             temperature = undefined // Thinking mode doesn't support temperature
         }

+        // Define cache control object if prompt caching is enabled
+        const cacheControl = this.options.liteLlmUsePromptCache ? { cache_control: { type: "ephemeral" } } : undefined
+
+        // Add cache_control to system message if enabled
+        const enhancedSystemMessage = {
+            ...systemMessage,
+            ...(cacheControl && cacheControl),
+        }
+
+        // Find the last two user messages to apply caching
+        const userMsgIndices = formattedMessages.reduce(
+            (acc, msg, index) => (msg.role === "user" ? [...acc, index] : acc),
+            [] as number[],
+        )
+        const lastUserMsgIndex = userMsgIndices[userMsgIndices.length - 1] ?? -1
+        const secondLastUserMsgIndex = userMsgIndices[userMsgIndices.length - 2] ?? -1
+
+        // Apply cache_control to the last two user messages if enabled
+        const enhancedMessages = formattedMessages.map((message, index) => {
+            if ((index === lastUserMsgIndex || index === secondLastUserMsgIndex) && cacheControl) {
+                return {
+                    ...message,
+                    ...cacheControl,
+                }
+            }
+            return message
+        })
+
         const stream = await this.client.chat.completions.create({
             model: this.options.liteLlmModelId || liteLlmDefaultModelId,
-            messages: [systemMessage, ...formattedMessages],
+            messages: [enhancedSystemMessage, ...enhancedMessages],
             temperature,
             stream: true,
             stream_options: { include_usage: true },

@@ -111,10 +139,27 @@ export class LiteLlmHandler implements ApiHandler {
             if (chunk.usage) {
                 const totalCost =
                     (inputCost * chunk.usage.prompt_tokens) / 1e6 + (outputCost * chunk.usage.completion_tokens) / 1e6
+
+                // Extract cache-related information if available
+                // Need to use type assertion since these properties are not in the standard OpenAI types
+                const usage = chunk.usage as {
+                    prompt_tokens: number
+                    completion_tokens: number
+                    cache_creation_input_tokens?: number
+                    prompt_cache_miss_tokens?: number
+                    cache_read_input_tokens?: number
+                    prompt_cache_hit_tokens?: number
+                }
+
+                const cacheWriteTokens = usage.cache_creation_input_tokens || usage.prompt_cache_miss_tokens || 0
+                const cacheReadTokens = usage.cache_read_input_tokens || usage.prompt_cache_hit_tokens || 0
+
                 yield {
                     type: "usage",
-                    inputTokens: chunk.usage.prompt_tokens || 0,
-                    outputTokens: chunk.usage.completion_tokens || 0,
+                    inputTokens: usage.prompt_tokens || 0,
+                    outputTokens: usage.completion_tokens || 0,
+                    cacheWriteTokens: cacheWriteTokens > 0 ? cacheWriteTokens : undefined,
+                    cacheReadTokens: cacheReadTokens > 0 ? cacheReadTokens : undefined,
                     totalCost,
                 }
             }
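
In effect, when liteLlmUsePromptCache is set the handler attaches Anthropic-style cache_control breakpoints to the system prompt and the two most recent user messages before the request goes through the OpenAI-compatible LiteLLM client. On the way back, cache_creation_input_tokens and cache_read_input_tokens are Anthropic's usage field names, while the prompt_cache_miss_tokens / prompt_cache_hit_tokens fallbacks cover proxies that report caching with DeepSeek-style names. Below is a minimal, standalone sketch of the message tagging only; the ChatMessage type and the applyPromptCache helper are illustrative and not part of the codebase.

// Sketch of the cache_control tagging performed by the handler (hypothetical types and helper).
type ChatMessage = {
    role: "system" | "user" | "assistant"
    content: string
    cache_control?: { type: "ephemeral" }
}

function applyPromptCache(systemMessage: ChatMessage, messages: ChatMessage[], usePromptCache: boolean): ChatMessage[] {
    const cacheControl = usePromptCache ? { cache_control: { type: "ephemeral" as const } } : undefined

    // The system prompt gets a breakpoint so its prefix can be reused across requests.
    const enhancedSystemMessage = { ...systemMessage, ...(cacheControl && cacheControl) }

    // Only the last two user turns are tagged, keeping the growing conversation prefix cacheable.
    const userMsgIndices = messages.reduce((acc, msg, i) => (msg.role === "user" ? [...acc, i] : acc), [] as number[])
    const lastTwoUserTurns = new Set(userMsgIndices.slice(-2))

    const enhancedMessages = messages.map((msg, i) => (cacheControl && lastTwoUserTurns.has(i) ? { ...msg, ...cacheControl } : msg))
    return [enhancedSystemMessage, ...enhancedMessages]
}

// Example: the system message plus the two most recent user turns carry cache_control.
console.log(
    applyPromptCache(
        { role: "system", content: "You are a coding assistant." },
        [
            { role: "user", content: "Refactor foo()" },
            { role: "assistant", content: "Done." },
            { role: "user", content: "Now add tests." },
        ],
        true,
    ),
)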

src/core/storage/state-keys.ts

Lines changed: 1 addition & 0 deletions

@@ -58,6 +58,7 @@ export type GlobalStateKey =
     | "previousModeModelInfo"
     | "liteLlmBaseUrl"
     | "liteLlmModelId"
+    | "liteLlmUsePromptCache"
     | "qwenApiLine"
     | "requestyModelId"
     | "togetherModelId"

src/core/storage/state.ts

Lines changed: 5 additions & 0 deletions

@@ -101,6 +101,7 @@ export async function getAllExtensionState(context: vscode.ExtensionContext) {
         vsCodeLmModelSelector,
         liteLlmBaseUrl,
         liteLlmModelId,
+        liteLlmUsePromptCache,
         userInfo,
         previousModeApiProvider,
         previousModeModelId,

@@ -166,6 +167,7 @@ export async function getAllExtensionState(context: vscode.ExtensionContext) {
         getGlobalState(context, "vsCodeLmModelSelector") as Promise<vscode.LanguageModelChatSelector | undefined>,
         getGlobalState(context, "liteLlmBaseUrl") as Promise<string | undefined>,
         getGlobalState(context, "liteLlmModelId") as Promise<string | undefined>,
+        getGlobalState(context, "liteLlmUsePromptCache") as Promise<boolean | undefined>,
         getGlobalState(context, "userInfo") as Promise<UserInfo | undefined>,
         getGlobalState(context, "previousModeApiProvider") as Promise<ApiProvider | undefined>,
         getGlobalState(context, "previousModeModelId") as Promise<string | undefined>,

@@ -268,6 +270,7 @@ export async function getAllExtensionState(context: vscode.ExtensionContext) {
         liteLlmBaseUrl,
         liteLlmModelId,
         liteLlmApiKey,
+        liteLlmUsePromptCache,
         asksageApiKey,
         asksageApiUrl,
         xaiApiKey,

@@ -336,6 +339,7 @@ export async function updateApiConfiguration(context: vscode.ExtensionContext, a
         liteLlmBaseUrl,
         liteLlmModelId,
         liteLlmApiKey,
+        liteLlmUsePromptCache,
         qwenApiLine,
         asksageApiKey,
         asksageApiUrl,

@@ -386,6 +390,7 @@ export async function updateApiConfiguration(context: vscode.ExtensionContext, a
     await updateGlobalState(context, "vsCodeLmModelSelector", vsCodeLmModelSelector)
     await updateGlobalState(context, "liteLlmBaseUrl", liteLlmBaseUrl)
     await updateGlobalState(context, "liteLlmModelId", liteLlmModelId)
+    await updateGlobalState(context, "liteLlmUsePromptCache", liteLlmUsePromptCache)
     await updateGlobalState(context, "qwenApiLine", qwenApiLine)
     await updateGlobalState(context, "requestyModelId", requestyModelId)
     await updateGlobalState(context, "togetherModelId", togetherModelId)
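
The new key rides through the same persistence helpers as the other LiteLLM settings. A minimal sketch of the round trip, reusing the getGlobalState and updateGlobalState calls shown above (context is the vscode.ExtensionContext those functions already receive):

// Sketch only: persist the checkbox value, then read it back as part of extension state.
await updateGlobalState(context, "liteLlmUsePromptCache", true)
const liteLlmUsePromptCache = (await getGlobalState(context, "liteLlmUsePromptCache")) as boolean | undefined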

src/shared/api.ts

Lines changed: 4 additions & 1 deletion

@@ -29,6 +29,7 @@ export interface ApiHandlerOptions {
     liteLlmBaseUrl?: string
     liteLlmModelId?: string
     liteLlmApiKey?: string
+    liteLlmUsePromptCache?: boolean
     anthropicBaseUrl?: string
     openRouterApiKey?: string
     openRouterModelId?: string

@@ -1239,9 +1240,11 @@ export const liteLlmModelInfoSaneDefaults: ModelInfo = {
     maxTokens: -1,
     contextWindow: 128_000,
     supportsImages: true,
-    supportsPromptCache: false,
+    supportsPromptCache: true,
     inputPrice: 0,
     outputPrice: 0,
+    cacheWritesPrice: 0,
+    cacheReadsPrice: 0,
 }

 // AskSage Models
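
Flipping supportsPromptCache to true is what lets the caching checkbox render in ApiOptions.tsx (it is gated on selectedModelInfo.supportsPromptCache), and the new cache prices default to 0 because a LiteLLM proxy can sit in front of deployments with unknown pricing. As a hedged sketch of how ModelInfo-style cache prices could be folded into a per-request cost alongside the cache token counts the handler now reports; note that this commit's totalCost only bills prompt and completion tokens, and estimateCost is a hypothetical helper:

// Hypothetical cost helper; field names mirror ModelInfo, prices are per million tokens.
interface CachePricing {
    inputPrice: number
    outputPrice: number
    cacheWritesPrice?: number
    cacheReadsPrice?: number
}

function estimateCost(
    p: CachePricing,
    tokens: { input: number; output: number; cacheWrites?: number; cacheReads?: number },
): number {
    return (
        (p.inputPrice * tokens.input) / 1e6 +
        (p.outputPrice * tokens.output) / 1e6 +
        ((p.cacheWritesPrice ?? 0) * (tokens.cacheWrites ?? 0)) / 1e6 +
        ((p.cacheReadsPrice ?? 0) * (tokens.cacheReads ?? 0)) / 1e6
    )
}

// With the sane defaults above (all prices 0) this evaluates to 0 for any usage.
console.log(estimateCost({ inputPrice: 0, outputPrice: 0, cacheWritesPrice: 0, cacheReadsPrice: 0 }, { input: 1200, output: 300, cacheReads: 900 }))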

webview-ui/src/components/settings/ApiOptions.tsx

Lines changed: 24 additions & 1 deletion

@@ -47,6 +47,7 @@ import {
     sambanovaDefaultModelId,
     doubaoModels,
     doubaoDefaultModelId,
+    liteLlmModelInfoSaneDefaults,
 } from "@shared/api"
 import { ExtensionMessage } from "@shared/ExtensionMessage"
 import { useExtensionState } from "@/context/ExtensionStateContext"

@@ -1240,6 +1241,28 @@ const ApiOptions = ({ showModelOptions, apiErrorMessage, modelIdErrorMessage, is
                         <span style={{ fontWeight: 500 }}>Model ID</span>
                     </VSCodeTextField>

+                    <div style={{ display: "flex", flexDirection: "column", marginTop: 10, marginBottom: 10 }}>
+                        {selectedModelInfo.supportsPromptCache && (
+                            <>
+                                <VSCodeCheckbox
+                                    checked={apiConfiguration?.liteLlmUsePromptCache || false}
+                                    onChange={(e: any) => {
+                                        const isChecked = e.target.checked === true
+                                        setApiConfiguration({
+                                            ...apiConfiguration,
+                                            liteLlmUsePromptCache: isChecked,
+                                        })
+                                    }}
+                                    style={{ fontWeight: 500, color: "var(--vscode-charts-green)" }}>
+                                    Use prompt caching (GA)
+                                </VSCodeCheckbox>
+                                <p style={{ fontSize: "12px", marginTop: 3, color: "var(--vscode-charts-green)" }}>
+                                    Prompt caching requires a supported provider and model
+                                </p>
+                            </>
+                        )}
+                    </div>
+
                     <>
                         <ThinkingBudgetSlider apiConfiguration={apiConfiguration} setApiConfiguration={setApiConfiguration} />
                         <p

@@ -1778,7 +1801,7 @@ export function normalizeApiConfiguration(apiConfiguration?: ApiConfiguration):
             return {
                 selectedProvider: provider,
                 selectedModelId: apiConfiguration?.liteLlmModelId || "",
-                selectedModelInfo: openAiModelInfoSaneDefaults,
+                selectedModelInfo: liteLlmModelInfoSaneDefaults,
             }
         case "xai":
             return getProviderData(xaiModels, xaiDefaultModelId)
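
End to end, the checkbox writes liteLlmUsePromptCache into apiConfiguration, updateApiConfiguration persists it to global state, and getAllExtensionState hands it back to the provider layer as part of ApiHandlerOptions. A small sketch of the options object the handler would receive, with illustrative values that are not taken from the commit:

import type { ApiHandlerOptions } from "@shared/api"

// Example configuration with prompt caching enabled (base URL, key, and model ID are placeholders).
const options: ApiHandlerOptions = {
    liteLlmBaseUrl: "http://localhost:4000",
    liteLlmApiKey: "sk-example",
    liteLlmModelId: "claude-3-7-sonnet",
    liteLlmUsePromptCache: true,
}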
