|
1 | 1 | import { Anthropic } from "@anthropic-ai/sdk" |
| 2 | +import { ModelInfo } from "../../shared/api" |
2 | 3 |
|
3 | | -/* |
4 | | -We can't implement a dynamically updating sliding window as it would break prompt cache |
5 | | -every time. To maintain the benefits of caching, we need to keep conversation history |
6 | | -static. This operation should be performed as infrequently as possible. If a user reaches |
7 | | -a 200k context, we can assume that the first half is likely irrelevant to their current task. |
8 | | -Therefore, this function should only be called when absolutely necessary to fit within |
9 | | -context limits, not as a continuous process. |
10 | | -*/ |
11 | | -export function truncateHalfConversation( |
| 4 | +/** |
| 5 | + * Truncates a conversation by removing a fraction of the messages. |
| 6 | + * |
| 7 | + * The first message is always retained, and a specified fraction (rounded down to an even number)
| 8 | + * of messages from the beginning (excluding the first) is removed. |
| 9 | + * |
| 10 | + * @param {Anthropic.Messages.MessageParam[]} messages - The conversation messages. |
| 11 | + * @param {number} fracToRemove - The fraction (between 0 and 1) of messages (excluding the first) to remove. |
| 12 | + * @returns {Anthropic.Messages.MessageParam[]} The truncated conversation messages. |
| 13 | + */ |
| 14 | +export function truncateConversation( |
12 | 15 | messages: Anthropic.Messages.MessageParam[], |
| 16 | + fracToRemove: number, |
13 | 17 | ): Anthropic.Messages.MessageParam[] { |
14 | | - // API expects messages to be in user-assistant order, and tool use messages must be followed by tool results. We need to maintain this structure while truncating. |
15 | | - |
16 | | - // Always keep the first Task message (this includes the project's file structure in environment_details) |
17 | 18 | const truncatedMessages = [messages[0]] |
18 | | - |
19 | | - // Remove half of user-assistant pairs |
20 | | - const messagesToRemove = Math.floor(messages.length / 4) * 2 // has to be even number |
21 | | - |
22 | | - const remainingMessages = messages.slice(messagesToRemove + 1) // has to start with assistant message since tool result cannot follow assistant message with no tool use |
| 19 | + const rawMessagesToRemove = Math.floor((messages.length - 1) * fracToRemove) |
| 20 | + const messagesToRemove = rawMessagesToRemove - (rawMessagesToRemove % 2) |
| 21 | + const remainingMessages = messages.slice(messagesToRemove + 1) |
23 | 22 | truncatedMessages.push(...remainingMessages) |
24 | 23 |
|
25 | 24 | return truncatedMessages |
26 | 25 | } |
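To sanity-check the rounding behavior, here is a small usage sketch (not part of the diff; the sample conversation is made up). With seven messages and fracToRemove = 0.5, floor((7 - 1) * 0.5) = 3 is rounded down to the even count 2, so the two oldest messages after the first are dropped:

	const conversation: Anthropic.Messages.MessageParam[] = [
		{ role: "user", content: "task" }, // always kept
		{ role: "assistant", content: "a1" },
		{ role: "user", content: "u1" },
		{ role: "assistant", content: "a2" },
		{ role: "user", content: "u2" },
		{ role: "assistant", content: "a3" },
		{ role: "user", content: "u3" },
	]

	const truncated = truncateConversation(conversation, 0.5)
	// truncated === [conversation[0], ...conversation.slice(3)] (5 messages),
	// and the remainder still starts with an assistant message, preserving the
	// user/assistant alternation the API expects.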
| 26 | + |
| 27 | +/** |
| 28 | + * Conditionally truncates the conversation messages if the total token count exceeds the model's limit. |
| 29 | + * |
| 30 | + * Depending on whether the model supports prompt caching, different maximum token thresholds |
| 31 | + * and truncation fractions are used. If the current total tokens meet or exceed the threshold,
| 32 | + * the conversation is truncated using the appropriate fraction. |
| 33 | + * |
| 34 | + * @param {Anthropic.Messages.MessageParam[]} messages - The conversation messages. |
| 35 | + * @param {number} totalTokens - The total number of tokens in the conversation. |
| 36 | + * @param {ModelInfo} modelInfo - Model metadata including context window size and prompt cache support. |
| 37 | + * @returns {Anthropic.Messages.MessageParam[]} The original or truncated conversation messages. |
| 38 | + */ |
| 39 | +export function truncateConversationIfNeeded( |
| 40 | + messages: Anthropic.Messages.MessageParam[], |
| 41 | + totalTokens: number, |
| 42 | + modelInfo: ModelInfo, |
| 43 | +): Anthropic.Messages.MessageParam[] { |
| 44 | + if (modelInfo.supportsPromptCache) { |
| 45 | + return totalTokens < getMaxTokensForPromptCachingModels(modelInfo) |
| 46 | + ? messages |
| 47 | + : truncateConversation(messages, getTruncFractionForPromptCachingModels(modelInfo)) |
| 48 | + } else { |
| 49 | + return totalTokens < getMaxTokensForNonPromptCachingModels(modelInfo) |
| 50 | + ? messages |
| 51 | + : truncateConversation(messages, getTruncFractionForNonPromptCachingModels(modelInfo)) |
| 52 | + } |
| 53 | +} |
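For example, with a prompt-caching model and a 200k context window the threshold works out to 160k tokens, so truncation only kicks in at or above that point (a sketch; the modelInfo literal is hypothetical and only fills in the fields read here):

	const modelInfo = { contextWindow: 200_000, supportsPromptCache: true } as ModelInfo

	// Threshold: max(200_000 - 40_000, 200_000 * 0.8) = 160_000 tokens.
	truncateConversationIfNeeded(messages, 150_000, modelInfo) // below the threshold: returned as-is
	truncateConversationIfNeeded(messages, 170_000, modelInfo) // at or above the threshold: truncated with fraction 0.5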
| 54 | + |
| 55 | +/** |
| 56 | + * Calculates the maximum allowed tokens for models that support prompt caching. |
| 57 | + * |
| 58 | + * The maximum is computed as the greater of (contextWindow - 40000) and 80% of the contextWindow. |
| 59 | + * |
| 60 | + * @param {ModelInfo} modelInfo - The model information containing the context window size. |
| 61 | + * @returns {number} The maximum number of tokens allowed for prompt caching models. |
| 62 | + */ |
| 63 | +function getMaxTokensForPromptCachingModels(modelInfo: ModelInfo): number { |
| 64 | + return Math.max(modelInfo.contextWindow - 40_000, modelInfo.contextWindow * 0.8) |
| 65 | +} |
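A few worked values for this threshold (illustrative window sizes only):

	// contextWindow = 200_000: max(160_000, 160_000) = 160_000
	// contextWindow = 128_000: max( 88_000, 102_400) = 102_400
	// contextWindow =  64_000: max( 24_000,  51_200) =  51_200 (the 80% floor dominates)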
| 66 | + |
| 67 | +/** |
| 68 | + * Provides the fraction of messages to remove for models that support prompt caching. |
| 69 | + * |
| 70 | + * @param {ModelInfo} modelInfo - The model information (unused in current implementation). |
| 71 | + * @returns {number} The truncation fraction for prompt caching models (fixed at 0.5). |
| 72 | + */ |
| 73 | +function getTruncFractionForPromptCachingModels(modelInfo: ModelInfo): number { |
| 74 | + return 0.5 |
| 75 | +} |
| 76 | + |
| 77 | +/** |
| 78 | + * Calculates the maximum allowed tokens for models that do not support prompt caching. |
| 79 | + * |
| 80 | + * The maximum is computed as the greater of (contextWindow - 40000) and 80% of the contextWindow. |
| 81 | + * |
| 82 | + * @param {ModelInfo} modelInfo - The model information containing the context window size. |
| 83 | + * @returns {number} The maximum number of tokens allowed for non-prompt caching models. |
| 84 | + */ |
| 85 | +function getMaxTokensForNonPromptCachingModels(modelInfo: ModelInfo): number { |
| 86 | + return Math.max(modelInfo.contextWindow - 40_000, modelInfo.contextWindow * 0.8) |
| 87 | +} |
| 88 | + |
| 89 | +/** |
| 90 | + * Provides the fraction of messages to remove for models that do not support prompt caching. |
| 91 | + * |
| 92 | + * @param {ModelInfo} modelInfo - The model information containing the context window size.
| 93 | + * @returns {number} The truncation fraction for non-prompt caching models, computed as min(40000 / contextWindow, 0.2).
| 94 | + */ |
| 95 | +function getTruncFractionForNonPromptCachingModels(modelInfo: ModelInfo): number { |
| 96 | + return Math.min(40_000 / modelInfo.contextWindow, 0.2) |
| 97 | +} |
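Worked values for this fraction (illustrative window sizes only): smaller context windows are capped at removing 20% of the non-first messages, while very large windows remove proportionally less:

	// contextWindow =   128_000: min(40_000 / 128_000, 0.2) = min(0.3125, 0.2) = 0.2
	// contextWindow = 1_000_000: min(40_000 / 1_000_000, 0.2) = 0.04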