
Commit 4a42fec

Merge pull request #675 from nissa-seru/truncation-updates
Enable separate config for truncation for models without context caching
2 parents 00ec57a + 98c8cc9 commit 4a42fec

File tree

3 files changed: +111 −42 lines


src/core/Cline.ts

Lines changed: 20 additions & 13 deletions
@@ -53,7 +53,7 @@ import { AssistantMessageContent, parseAssistantMessage, ToolParamName, ToolUseN
 import { formatResponse } from "./prompts/responses"
 import { SYSTEM_PROMPT } from "./prompts/system"
 import { modes, defaultModeSlug, getModeBySlug } from "../shared/modes"
-import { truncateHalfConversation } from "./sliding-window"
+import { truncateConversationIfNeeded } from "./sliding-window"
 import { ClineProvider, GlobalFileNames } from "./webview/ClineProvider"
 import { detectCodeOmission } from "../integrations/editor/detect-omission"
 import { BrowserSession } from "../services/browser/BrowserSession"
@@ -876,18 +876,25 @@ export class Cline {
 
 		// If the previous API request's total token usage is close to the context window, truncate the conversation history to free up space for the new request
 		if (previousApiReqIndex >= 0) {
-			const previousRequest = this.clineMessages[previousApiReqIndex]
-			if (previousRequest && previousRequest.text) {
-				const { tokensIn, tokensOut, cacheWrites, cacheReads }: ClineApiReqInfo = JSON.parse(
-					previousRequest.text,
-				)
-				const totalTokens = (tokensIn || 0) + (tokensOut || 0) + (cacheWrites || 0) + (cacheReads || 0)
-				const contextWindow = this.api.getModel().info.contextWindow || 128_000
-				const maxAllowedSize = Math.max(contextWindow - 40_000, contextWindow * 0.8)
-				if (totalTokens >= maxAllowedSize) {
-					const truncatedMessages = truncateHalfConversation(this.apiConversationHistory)
-					await this.overwriteApiConversationHistory(truncatedMessages)
-				}
+			const previousRequest = this.clineMessages[previousApiReqIndex]?.text
+			if (!previousRequest) return
+
+			const {
+				tokensIn = 0,
+				tokensOut = 0,
+				cacheWrites = 0,
+				cacheReads = 0,
+			}: ClineApiReqInfo = JSON.parse(previousRequest)
+			const totalTokens = tokensIn + tokensOut + cacheWrites + cacheReads
+
+			const trimmedMessages = truncateConversationIfNeeded(
+				this.apiConversationHistory,
+				totalTokens,
+				this.api.getModel().info,
+			)
+
+			if (trimmedMessages !== this.apiConversationHistory) {
+				await this.overwriteApiConversationHistory(trimmedMessages)
 			}
 		}
 
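
Side note on the call-site contract: the rewrite no longer computes a local maxAllowedSize; it delegates the decision to truncateConversationIfNeeded and only persists the history when the helper returns a different array. Below is a minimal, self-contained TypeScript sketch of that pattern — the Message type, ModelInfo shape, sample history, and sample token numbers are illustrative stand-ins, not the project's real types or data.

// Sketch (not the real Cline.ts method): derive a token total with destructuring
// defaults, then rely on reference identity to detect whether truncation happened.
type Message = { role: "user" | "assistant"; content: string }

interface ModelInfo {
	contextWindow: number
	supportsPromptCache: boolean
}

// Stand-in for the real helper: returns the SAME array when under the limit.
function truncateConversationIfNeeded(messages: Message[], totalTokens: number, modelInfo: ModelInfo): Message[] {
	const limit = Math.max(modelInfo.contextWindow - 40_000, modelInfo.contextWindow * 0.8)
	return totalTokens < limit ? messages : [messages[0], ...messages.slice(3)]
}

const history: Message[] = [
	{ role: "user", content: "task" },
	{ role: "assistant", content: "plan" },
	{ role: "user", content: "tool result" },
	{ role: "assistant", content: "answer" },
]

// Missing fields default to 0, replacing the old (tokensIn || 0) + ... chains.
const { tokensIn = 0, tokensOut = 0, cacheWrites = 0, cacheReads = 0 } = JSON.parse(
	'{"tokensIn": 150000, "tokensOut": 20000}',
)
const totalTokens = tokensIn + tokensOut + cacheWrites + cacheReads

const trimmed = truncateConversationIfNeeded(history, totalTokens, {
	contextWindow: 200_000,
	supportsPromptCache: true,
})

// Only overwrite the stored history when the helper actually produced a new array.
if (trimmed !== history) {
	console.log(`history truncated to ${trimmed.length} messages`)
}

Returning the original reference (rather than a copy) when nothing is trimmed is what makes the trimmedMessages !== this.apiConversationHistory guard in the diff above cheap and reliable.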

src/core/sliding-window/index.ts

Lines changed: 88 additions & 17 deletions
@@ -1,26 +1,97 @@
 import { Anthropic } from "@anthropic-ai/sdk"
+import { ModelInfo } from "../../shared/api"
 
-/*
-We can't implement a dynamically updating sliding window as it would break prompt cache
-every time. To maintain the benefits of caching, we need to keep conversation history
-static. This operation should be performed as infrequently as possible. If a user reaches
-a 200k context, we can assume that the first half is likely irrelevant to their current task.
-Therefore, this function should only be called when absolutely necessary to fit within
-context limits, not as a continuous process.
-*/
-export function truncateHalfConversation(
+/**
+ * Truncates a conversation by removing a fraction of the messages.
+ *
+ * The first message is always retained, and a specified fraction (rounded to an even number)
+ * of messages from the beginning (excluding the first) is removed.
+ *
+ * @param {Anthropic.Messages.MessageParam[]} messages - The conversation messages.
+ * @param {number} fracToRemove - The fraction (between 0 and 1) of messages (excluding the first) to remove.
+ * @returns {Anthropic.Messages.MessageParam[]} The truncated conversation messages.
+ */
+export function truncateConversation(
 	messages: Anthropic.Messages.MessageParam[],
+	fracToRemove: number,
 ): Anthropic.Messages.MessageParam[] {
-	// API expects messages to be in user-assistant order, and tool use messages must be followed by tool results. We need to maintain this structure while truncating.
-
-	// Always keep the first Task message (this includes the project's file structure in environment_details)
 	const truncatedMessages = [messages[0]]
-
-	// Remove half of user-assistant pairs
-	const messagesToRemove = Math.floor(messages.length / 4) * 2 // has to be even number
-
-	const remainingMessages = messages.slice(messagesToRemove + 1) // has to start with assistant message since tool result cannot follow assistant message with no tool use
+	const rawMessagesToRemove = Math.floor((messages.length - 1) * fracToRemove)
+	const messagesToRemove = rawMessagesToRemove - (rawMessagesToRemove % 2)
+	const remainingMessages = messages.slice(messagesToRemove + 1)
 	truncatedMessages.push(...remainingMessages)
 
 	return truncatedMessages
 }
+
+/**
+ * Conditionally truncates the conversation messages if the total token count exceeds the model's limit.
+ *
+ * Depending on whether the model supports prompt caching, different maximum token thresholds
+ * and truncation fractions are used. If the current total tokens exceed the threshold,
+ * the conversation is truncated using the appropriate fraction.
+ *
+ * @param {Anthropic.Messages.MessageParam[]} messages - The conversation messages.
+ * @param {number} totalTokens - The total number of tokens in the conversation.
+ * @param {ModelInfo} modelInfo - Model metadata including context window size and prompt cache support.
+ * @returns {Anthropic.Messages.MessageParam[]} The original or truncated conversation messages.
+ */
+export function truncateConversationIfNeeded(
+	messages: Anthropic.Messages.MessageParam[],
+	totalTokens: number,
+	modelInfo: ModelInfo,
+): Anthropic.Messages.MessageParam[] {
+	if (modelInfo.supportsPromptCache) {
+		return totalTokens < getMaxTokensForPromptCachingModels(modelInfo)
+			? messages
+			: truncateConversation(messages, getTruncFractionForPromptCachingModels(modelInfo))
+	} else {
+		return totalTokens < getMaxTokensForNonPromptCachingModels(modelInfo)
+			? messages
+			: truncateConversation(messages, getTruncFractionForNonPromptCachingModels(modelInfo))
+	}
+}
+
+/**
+ * Calculates the maximum allowed tokens for models that support prompt caching.
+ *
+ * The maximum is computed as the greater of (contextWindow - 40000) and 80% of the contextWindow.
+ *
+ * @param {ModelInfo} modelInfo - The model information containing the context window size.
+ * @returns {number} The maximum number of tokens allowed for prompt caching models.
+ */
+function getMaxTokensForPromptCachingModels(modelInfo: ModelInfo): number {
+	return Math.max(modelInfo.contextWindow - 40_000, modelInfo.contextWindow * 0.8)
+}
+
+/**
+ * Provides the fraction of messages to remove for models that support prompt caching.
+ *
+ * @param {ModelInfo} modelInfo - The model information (unused in current implementation).
+ * @returns {number} The truncation fraction for prompt caching models (fixed at 0.5).
+ */
+function getTruncFractionForPromptCachingModels(modelInfo: ModelInfo): number {
+	return 0.5
+}
+
+/**
+ * Calculates the maximum allowed tokens for models that do not support prompt caching.
+ *
+ * The maximum is computed as the greater of (contextWindow - 40000) and 80% of the contextWindow.
+ *
+ * @param {ModelInfo} modelInfo - The model information containing the context window size.
+ * @returns {number} The maximum number of tokens allowed for non-prompt caching models.
+ */
+function getMaxTokensForNonPromptCachingModels(modelInfo: ModelInfo): number {
+	return Math.max(modelInfo.contextWindow - 40_000, modelInfo.contextWindow * 0.8)
+}
+
+/**
+ * Provides the fraction of messages to remove for models that do not support prompt caching.
+ *
+ * @param {ModelInfo} modelInfo - The model information.
+ * @returns {number} The truncation fraction for non-prompt caching models (40,000 / contextWindow, capped at 0.2).
+ */
+function getTruncFractionForNonPromptCachingModels(modelInfo: ModelInfo): number {
+	return Math.min(40_000 / modelInfo.contextWindow, 0.2)
+}
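
To make the thresholds above concrete, here is a small worked example; the 200k and 64k context window sizes are illustrative picks, not values taken from the diff.

// Worked example of the limits and fractions defined in sliding-window/index.ts.
// Both model classes use maxTokens = max(contextWindow - 40_000, 0.8 * contextWindow).
function maxTokens(contextWindow: number): number {
	return Math.max(contextWindow - 40_000, contextWindow * 0.8)
}

// Non-prompt-caching models remove a fraction of 40_000 / contextWindow, capped at 0.2.
function nonCachingFraction(contextWindow: number): number {
	return Math.min(40_000 / contextWindow, 0.2)
}

// 200k-window model with prompt caching: truncate once past 160k tokens, then drop
// half of the history (fraction 0.5) so the remaining prefix stays cache-friendly.
console.log(maxTokens(200_000)) // 160000 (200_000 - 40_000 and 0.8 * 200_000 both give 160_000)

// 64k-window model without prompt caching: truncate past 51.2k tokens, trimming gently.
console.log(maxTokens(64_000)) // 51200 (0.8 * 64_000 beats 64_000 - 40_000 = 24_000)
console.log(nonCachingFraction(64_000)) // 0.2 (40_000 / 64_000 = 0.625, capped at 0.2)

That asymmetry appears to be the point of the PR title: models with prompt caching truncate rarely but in large chunks so the cached prefix survives between truncations, while models without caching can be trimmed more often in smaller slices.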

webview-ui/tsconfig.json

Lines changed: 3 additions & 12 deletions
@@ -1,11 +1,7 @@
 {
 	"compilerOptions": {
 		"target": "es5",
-		"lib": [
-			"dom",
-			"dom.iterable",
-			"esnext"
-		],
+		"lib": ["dom", "dom.iterable", "esnext"],
 		"allowJs": true,
 		"skipLibCheck": true,
 		"esModuleInterop": true,
@@ -21,13 +17,8 @@
 		"jsx": "react-jsx",
 		"baseUrl": ".",
 		"paths": {
-			"@/*": [
-				"./src/*"
-			]
+			"@/*": ["./src/*"]
 		}
 	},
-	"include": [
-		"src",
-		"../src/shared",
-	]
+	"include": ["src", "../src/shared"]
 }
