diff --git a/src/api/providers/__tests__/anthropic-token-counting.test.ts b/src/api/providers/__tests__/anthropic-token-counting.test.ts
new file mode 100644
index 00000000000..aed95340a58
--- /dev/null
+++ b/src/api/providers/__tests__/anthropic-token-counting.test.ts
@@ -0,0 +1,257 @@
+// npx jest src/api/providers/__tests__/anthropic-token-counting.test.ts
+
+import { Anthropic } from "@anthropic-ai/sdk"
+import { AnthropicHandler } from "../anthropic"
+import { CLAUDE_MAX_SAFE_TOKEN_LIMIT } from "../constants"
+import { ApiHandlerOptions } from "../../../shared/api"
+
+// Mock the Anthropic client
+jest.mock("@anthropic-ai/sdk", () => {
+	const mockCountTokensResponse = {
+		input_tokens: 5000, // Default token count
+	}
+
+	const mockMessageResponse = {
+		id: "msg_123",
+		type: "message",
+		role: "assistant",
+		content: [{ type: "text", text: "This is a test response" }],
+		model: "claude-3-7-sonnet-20250219",
+		stop_reason: "end_turn",
+		usage: {
+			input_tokens: 5000,
+			output_tokens: 100,
+		},
+	}
+
+	// Mock stream implementation
+	const mockStream = {
+		[Symbol.asyncIterator]: async function* () {
+			yield {
+				type: "message_start",
+				message: {
+					id: "msg_123",
+					type: "message",
+					role: "assistant",
+					content: [],
+					model: "claude-3-7-sonnet-20250219",
+					stop_reason: null,
+					usage: {
+						input_tokens: 5000,
+						output_tokens: 0,
+					},
+				},
+			}
+			yield {
+				type: "content_block_start",
+				index: 0,
+				content_block: {
+					type: "text",
+					text: "This is a test response",
+				},
+			}
+			yield {
+				type: "message_delta",
+				usage: {
+					output_tokens: 100,
+				},
+			}
+			yield {
+				type: "message_stop",
+			}
+		},
+	}
+
+	return {
+		Anthropic: jest.fn().mockImplementation(() => {
+			return {
+				messages: {
+					create: jest.fn().mockImplementation((params) => {
+						if (params.stream) {
+							return mockStream
+						}
+						return mockMessageResponse
+					}),
+					countTokens: jest.fn().mockImplementation((params) => {
+						// If the messages array is very large, simulate a high token count
+						let tokenCount = mockCountTokensResponse.input_tokens
+
+						if (params.messages && params.messages.length > 10) {
+							tokenCount = CLAUDE_MAX_SAFE_TOKEN_LIMIT + 10000
+						}
+
+						return Promise.resolve({ input_tokens: tokenCount })
+					}),
+				},
+			}
+		}),
+	}
+})
+
+describe("AnthropicHandler Token Counting", () => {
+	// Test with Claude 3.7 Sonnet
+	describe("with Claude 3.7 Sonnet", () => {
+		const options: ApiHandlerOptions = {
+			apiKey: "test-key",
+			apiModelId: "claude-3-7-sonnet-20250219",
+		}
+
+		let handler: AnthropicHandler
+
+		beforeEach(() => {
+			handler = new AnthropicHandler(options)
+			jest.clearAllMocks()
+		})
+
+		it("should count tokens for content blocks", async () => {
+			const content = [{ type: "text" as const, text: "Hello, world!" }]
+			const count = await handler.countTokens(content)
+			expect(count).toBe(5000) // Mock returns 5000
+		})
+
+		it("should count tokens for a complete message", async () => {
+			const systemPrompt = "You are a helpful assistant."
+			const messages = [
+				{ role: "user" as const, content: "Hello!" },
+				{ role: "assistant" as const, content: "Hi there!" },
+				{ role: "user" as const, content: "How are you?" },
+			]
+
+			const count = await handler.countMessageTokens(systemPrompt, messages, "claude-3-7-sonnet-20250219")
+
+			expect(count).toBe(5000) // Mock returns 5000
+		})
+
+		it("should truncate conversation when token count exceeds limit", async () => {
+			// Create a large number of messages to trigger truncation
+			const systemPrompt = "You are a helpful assistant."
+			const messages: Anthropic.Messages.MessageParam[] = []
+
+			// Add 20 messages to exceed the token limit
+			for (let i = 0; i < 20; i++) {
+				messages.push({
+					role: i % 2 === 0 ? "user" : "assistant",
+					content: `Message ${i}: This is a test message that should have enough content to trigger the token limit when combined with other messages.`,
+				})
+			}
+
+			// Spy on console.warn and console.log to verify warnings are logged
+			const consoleWarnSpy = jest.spyOn(console, "warn").mockImplementation()
+			const consoleLogSpy = jest.spyOn(console, "log").mockImplementation()
+
+			// Create a message stream
+			const stream = handler.createMessage(systemPrompt, messages)
+
+			// Consume the stream to trigger the token counting and truncation
+			for await (const _ of stream) {
+				// Just consume the stream
+			}
+
+			// Verify that warnings were logged about the token limit
+			expect(consoleWarnSpy).toHaveBeenCalled()
+			expect(consoleLogSpy).toHaveBeenCalled()
+
+			// Restore the console spies
+			consoleWarnSpy.mockRestore()
+			consoleLogSpy.mockRestore()
+		})
+	})
+
+	// Test with Claude 3 Opus
+	describe("with Claude 3 Opus", () => {
+		const options: ApiHandlerOptions = {
+			apiKey: "test-key",
+			apiModelId: "claude-3-opus-20240229",
+		}
+
+		let handler: AnthropicHandler
+
+		beforeEach(() => {
+			handler = new AnthropicHandler(options)
+			jest.clearAllMocks()
+		})
+
+		it("should truncate conversation when token count exceeds limit", async () => {
+			// Create a large number of messages to trigger truncation
+			const systemPrompt = "You are a helpful assistant."
+			const messages: Anthropic.Messages.MessageParam[] = []
+
+			// Add 20 messages to exceed the token limit
+			for (let i = 0; i < 20; i++) {
+				messages.push({
+					role: i % 2 === 0 ? "user" : "assistant",
+					content: `Message ${i}: This is a test message that should have enough content to trigger the token limit when combined with other messages.`,
+				})
+			}
+
+			// Spy on console.warn and console.log to verify warnings are logged
+			const consoleWarnSpy = jest.spyOn(console, "warn").mockImplementation()
+			const consoleLogSpy = jest.spyOn(console, "log").mockImplementation()
+
+			// Create a message stream
+			const stream = handler.createMessage(systemPrompt, messages)
+
+			// Consume the stream to trigger the token counting and truncation
+			for await (const _ of stream) {
+				// Just consume the stream
+			}
+
+			// Verify that warnings were logged about the token limit
+			expect(consoleWarnSpy).toHaveBeenCalled()
+			expect(consoleLogSpy).toHaveBeenCalled()
+
+			// Restore the console spies
+			consoleWarnSpy.mockRestore()
+			consoleLogSpy.mockRestore()
+		})
+	})
+
+	// Test with Claude 3 Haiku
+	describe("with Claude 3 Haiku", () => {
+		const options: ApiHandlerOptions = {
+			apiKey: "test-key",
+			apiModelId: "claude-3-haiku-20240307",
+		}
+
+		let handler: AnthropicHandler
+
+		beforeEach(() => {
+			handler = new AnthropicHandler(options)
+			jest.clearAllMocks()
+		})
+
+		it("should truncate conversation when token count exceeds limit", async () => {
+			// Create a large number of messages to trigger truncation
+			const systemPrompt = "You are a helpful assistant."
+			const messages: Anthropic.Messages.MessageParam[] = []
+
+			// Add 20 messages to exceed the token limit
+			for (let i = 0; i < 20; i++) {
+				messages.push({
+					role: i % 2 === 0 ? "user" : "assistant",
+					content: `Message ${i}: This is a test message that should have enough content to trigger the token limit when combined with other messages.`,
+				})
+			}
+
+			// Spy on console.warn and console.log to verify warnings are logged
+			const consoleWarnSpy = jest.spyOn(console, "warn").mockImplementation()
+			const consoleLogSpy = jest.spyOn(console, "log").mockImplementation()
+
+			// Create a message stream
+			const stream = handler.createMessage(systemPrompt, messages)
+
+			// Consume the stream to trigger the token counting and truncation
+			for await (const _ of stream) {
+				// Just consume the stream
+			}
+
+			// Verify that warnings were logged about the token limit
+			expect(consoleWarnSpy).toHaveBeenCalled()
+			expect(consoleLogSpy).toHaveBeenCalled()
+
+			// Restore the console spies
+			consoleWarnSpy.mockRestore()
+			consoleLogSpy.mockRestore()
+		})
+	})
+})
diff --git a/src/api/providers/anthropic.ts b/src/api/providers/anthropic.ts
index 5489b326093..72808f6ed31 100644
--- a/src/api/providers/anthropic.ts
+++ b/src/api/providers/anthropic.ts
@@ -10,8 +10,9 @@ import {
 } from "../../shared/api"
 import { ApiStream } from "../transform/stream"
 import { BaseProvider } from "./base-provider"
-import { ANTHROPIC_DEFAULT_MAX_TOKENS } from "./constants"
+import { ANTHROPIC_DEFAULT_MAX_TOKENS, CLAUDE_MAX_SAFE_TOKEN_LIMIT } from "./constants"
 import { SingleCompletionHandler, getModelParams } from "../index"
+import { truncateConversation } from "../../core/sliding-window"
 
 export class AnthropicHandler extends BaseProvider implements SingleCompletionHandler {
 	private options: ApiHandlerOptions
@@ -33,7 +34,61 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHa
 	async *createMessage(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): ApiStream {
 		let stream: AnthropicStream
 		const cacheControl: CacheControlEphemeral = { type: "ephemeral" }
-		let { id: modelId, maxTokens, thinking, temperature, virtualId } = this.getModel()
+		let { id: modelId, maxTokens, thinking, temperature, virtualId, info } = this.getModel()
+
+		// Check token count before sending the request for all Anthropic models
+		// Count tokens for the entire request
+		const tokenCount = await this.countMessageTokens(systemPrompt, messages, modelId)
+
+		// Get the context window size for the current model
+		const contextWindow = info.contextWindow || 200000
+
+		// Calculate a safe token limit (1k tokens below the context window)
+		const safeTokenLimit = Math.min(contextWindow - 1000, CLAUDE_MAX_SAFE_TOKEN_LIMIT)
+
+		// If the token count exceeds the safe limit, truncate the conversation
+		if (tokenCount > safeTokenLimit) {
+			console.warn(
+				`Token count (${tokenCount}) exceeds safe limit (${safeTokenLimit}) for model ${modelId}. Truncating conversation.`,
+			)
+
+			// Calculate how much we need to truncate
+			const excessTokens = tokenCount - safeTokenLimit
+			const totalTokens = tokenCount
+
+			// Determine truncation fraction based on excess tokens
+			// Start with 0.5 (50%) and increase if needed
+			let truncationFraction = 0.5
+
+			// If we're significantly over the limit, increase truncation
+			if (excessTokens > totalTokens * 0.3) {
+				truncationFraction = 0.7
+			}
+
+			// Truncate the conversation
+			const originalLength = messages.length
+			messages = truncateConversation(messages, truncationFraction)
+
+			console.log(
+				`Truncated conversation from ${originalLength} to ${messages.length} messages to fit within token limit.`,
+			)
+
+			// Verify token count after truncation
+			const newTokenCount = await this.countMessageTokens(systemPrompt, messages, modelId)
+
+			// If still over the limit, truncate again with a higher fraction
+			if (newTokenCount > safeTokenLimit) {
+				console.warn(
+					`After truncation, token count (${newTokenCount}) still exceeds safe limit. Truncating further.`,
+				)
+
+				messages = truncateConversation(messages, 0.8)
+
+				// Final verification
+				const finalTokenCount = await this.countMessageTokens(systemPrompt, messages, modelId)
+				console.log(`Final token count after truncation: ${finalTokenCount}`)
+			}
+		}
 
 		switch (modelId) {
 			case "claude-3-7-sonnet-20250219":
@@ -217,7 +272,32 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHa
 	}
 
 	async completePrompt(prompt: string) {
-		let { id: model, temperature } = this.getModel()
+		let { id: model, temperature, info } = this.getModel()
+
+		// Check token count before sending the request for all Anthropic models
+		// Count tokens for the prompt
+		const tokenCount = await this.countTokens([{ type: "text", text: prompt }])
+
+		// Get the context window size for the current model
+		const contextWindow = info.contextWindow || 200000
+
+		// Calculate a safe token limit (1k tokens below the context window)
+		const safeTokenLimit = Math.min(contextWindow - 1000, CLAUDE_MAX_SAFE_TOKEN_LIMIT)
+
+		// If the token count exceeds the safe limit, truncate the prompt
+		if (tokenCount > safeTokenLimit) {
+			console.warn(
+				`Prompt token count (${tokenCount}) exceeds safe limit (${safeTokenLimit}) for model ${model}. Truncating prompt.`,
+			)
+
+			// Calculate how much we need to truncate
+			const ratio = safeTokenLimit / tokenCount
+			const newLength = Math.floor(prompt.length * ratio * 0.9) // 90% of the calculated length for safety
+
+			// Truncate the prompt
+			prompt = prompt.substring(0, newLength)
+			console.log(`Truncated prompt to ${newLength} characters to fit within token limit.`)
+		}
 
 		const message = await this.client.messages.create({
 			model,
@@ -257,4 +337,47 @@ export class AnthropicHandler extends BaseProvider implements SingleCompletionHa
 			return super.countTokens(content)
 		}
 	}
+
+	/**
+	 * Counts tokens for a complete message request using Anthropic's API
+	 *
+	 * @param systemPrompt The system prompt
+	 * @param messages The conversation messages
+	 * @param model The model ID
+	 * @returns A promise resolving to the token count
+	 */
+	async countMessageTokens(
+		systemPrompt: string,
+		messages: Anthropic.Messages.MessageParam[],
+		model: string,
+	): Promise<number> {
+		try {
+			const response = await this.client.messages.countTokens({
+				model,
+				system: systemPrompt,
+				messages: messages,
+			})
+
+			return response.input_tokens
+		} catch (error) {
+			// Log the error but fall back to estimating tokens by counting each part separately
+			console.warn("Anthropic message token counting failed, using fallback", error)
+
+			// Fallback: Count system prompt tokens
+			const systemTokens = await this.countTokens([{ type: "text", text: systemPrompt }])
+
+			// Count tokens for each message
+			let messageTokens = 0
+			for (const message of messages) {
+				if (typeof message.content === "string") {
+					messageTokens += await this.countTokens([{ type: "text", text: message.content }])
+				} else {
+					messageTokens += await this.countTokens(message.content)
+				}
+			}
+
+			// Add some overhead for message formatting
+			return systemTokens + messageTokens + messages.length * 5
+		}
+	}
 }
diff --git a/src/api/providers/constants.ts b/src/api/providers/constants.ts
index 4d6c4672e50..d1a00455df1 100644
--- a/src/api/providers/constants.ts
+++ b/src/api/providers/constants.ts
@@ -5,4 +5,7 @@ export const DEFAULT_HEADERS = {
 
 export const ANTHROPIC_DEFAULT_MAX_TOKENS = 8192
 
+// Maximum safe token limit for Claude 3.7 Sonnet (200k - 1k safety buffer)
+export const CLAUDE_MAX_SAFE_TOKEN_LIMIT = 199000
+
 export const DEEP_SEEK_DEFAULT_TEMPERATURE = 0.6
diff --git a/src/core/__tests__/Cline.test.ts b/src/core/__tests__/Cline.test.ts
index 14540c834c3..93b9d4d301e 100644
--- a/src/core/__tests__/Cline.test.ts
+++ b/src/core/__tests__/Cline.test.ts
@@ -321,6 +321,7 @@ describe("Cline", () => {
 	describe("getEnvironmentDetails", () => {
 		describe("API conversation handling", () => {
+			// Set timeout to 15 seconds for this specific test
 			it("should clean conversation history before sending to API", async () => {
 				// Cline.create will now use our mocked getEnvironmentDetails
 				const [cline, task] = Cline.create({
@@ -387,7 +388,7 @@
 
 				// Verify extra properties were removed
 				expect(Object.keys(cleanedMessage!)).toEqual(["role", "content"])
-			})
+			}, 15000)
 
 			it("should handle image blocks based on model capabilities", async () => {
 				// Create two configurations - one with image support, one without
diff --git a/src/core/sliding-window/index.ts b/src/core/sliding-window/index.ts
index 75395ecd758..2f0d5731494 100644
--- a/src/core/sliding-window/index.ts
+++ b/src/core/sliding-window/index.ts
@@ -6,6 +6,12 @@ import { ApiHandler } from "../../api"
  */
 export const TOKEN_BUFFER_PERCENTAGE = 0.1
 
+/**
+ * Maximum safe token limit for Claude 3.7 Sonnet (200k - 1k safety buffer)
+ * This mirrors the value in constants.ts; it is redefined here to avoid a circular dependency
+ */
+export const CLAUDE_MAX_SAFE_TOKEN_LIMIT = 199000
+
 /**
  * Counts tokens for user content using the provider's token counting implementation.
  *
@@ -91,6 +97,39 @@ export async function truncateConversationIfNeeded({
 	// Calculate total effective tokens (totalTokens never includes the last message)
 	const effectiveTokens = totalTokens + lastMessageTokens
 
+	// Special handling for Anthropic models to ensure we stay under the context window limit
+	const { id: modelId, info } = apiHandler.getModel()
+
+	// Check if this is an Anthropic model
+	if (modelId.startsWith("claude-")) {
+		// Get the context window size for the current model
+		const modelContextWindow = info.contextWindow || 200000
+
+		// Calculate a safe token limit (1k tokens below the context window)
+		const safeTokenLimit = Math.min(modelContextWindow - 1000, CLAUDE_MAX_SAFE_TOKEN_LIMIT)
+
+		if (effectiveTokens > safeTokenLimit) {
+			console.warn(
+				`Token count (${effectiveTokens}) exceeds safe limit (${safeTokenLimit}) for model ${modelId}. Using aggressive truncation.`,
+			)
+
+			// Calculate how much we need to truncate
+			const excessTokens = effectiveTokens - safeTokenLimit
+
+			// Determine truncation fraction based on excess tokens
+			// Start with 0.5 (50%) and increase if needed
+			let truncationFraction = 0.5
+
+			// If we're significantly over the limit, increase truncation
+			if (excessTokens > effectiveTokens * 0.3) {
+				truncationFraction = 0.7
+			}
+
+			return truncateConversation(messages, truncationFraction)
+		}
+	}
+
+	// Standard truncation logic for other models
 	// Calculate available tokens for conversation history
 	// Truncate if we're within TOKEN_BUFFER_PERCENTAGE of the context window
 	const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
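
// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch above): a standalone TypeScript
// version of the truncation-fraction selection that createMessage and
// truncateConversationIfNeeded apply once a request exceeds the safe token
// limit. The helper name `pickTruncationFraction` is hypothetical.
// ---------------------------------------------------------------------------
function pickTruncationFraction(tokenCount: number, safeTokenLimit: number): number {
	// Start by removing half of the removable messages.
	let truncationFraction = 0.5

	// If the excess is more than 30% of the total token count, remove 70% instead.
	const excessTokens = tokenCount - safeTokenLimit
	if (excessTokens > tokenCount * 0.3) {
		truncationFraction = 0.7
	}

	return truncationFraction
}

// Example, assuming the 199000-token safe limit defined in constants.ts:
// pickTruncationFraction(250_000, 199_000) // => 0.5 (excess is ~20% of the total)
// pickTruncationFraction(300_000, 199_000) // => 0.7 (excess is ~34% of the total)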