diff --git a/packages/types/src/provider-settings.ts b/packages/types/src/provider-settings.ts
index 4dfeacbf07..e7b2adcc31 100644
--- a/packages/types/src/provider-settings.ts
+++ b/packages/types/src/provider-settings.ts
@@ -111,6 +111,9 @@ const baseProviderSettingsSchema = z.object({
 	modelMaxTokens: z.number().optional(),
 	modelMaxThinkingTokens: z.number().optional(),
 
+	// Model input token limit (for providers with per-request limits like Gemini free tier)
+	maxInputTokens: z.number().min(1).optional(),
+
 	// Model verbosity.
 	verbosity: verbosityLevelsSchema.optional(),
 })
diff --git a/src/core/sliding-window/__tests__/sliding-window.spec.ts b/src/core/sliding-window/__tests__/sliding-window.spec.ts
index 0f2c70c81b..6cfccb5a18 100644
--- a/src/core/sliding-window/__tests__/sliding-window.spec.ts
+++ b/src/core/sliding-window/__tests__/sliding-window.spec.ts
@@ -1243,5 +1243,115 @@ describe("Sliding Window", () => {
 			expect(result2).not.toEqual(messagesWithSmallContent)
 			expect(result2.messages.length).toBe(3) // Truncated with 0.5 fraction
 		})
+
+		it("should respect maxInputTokens limit when provided", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test with maxInputTokens set to 125000 (Gemini free tier limit)
+			const result = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 150000, // Above the maxInputTokens limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 125000, // Gemini free tier limit
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate because total tokens exceed maxInputTokens
+			expect(result.messages).toHaveLength(3)
+			expect(result.summary).toBe("")
+			expect(result.cost).toBe(0)
+			expect(result.prevContextTokens).toBe(150000)
+		})
+
+		it("should use the more restrictive limit between maxInputTokens and context window", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test where context window limit is more restrictive
+			// Context window limit: 200000 * 0.9 - 8192 = 171808
+			const result1 = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 171809, // Just above context window limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 300000, // Higher than context window
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate based on context window limit
+			expect(result1.messages).toHaveLength(3)
+
+			// Test where maxInputTokens is more restrictive
+			const result2 = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 100000,
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 50000, // Lower than current tokens
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate based on maxInputTokens limit
+			expect(result2.messages).toHaveLength(3)
+			expect(result2.summary).toBe("")
+			expect(result2.cost).toBe(0)
+			expect(result2.prevContextTokens).toBe(100000)
+		})
+
+		it("should not truncate when maxInputTokens is not exceeded", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test with tokens below maxInputTokens limit
+			const result = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 50000, // Below the maxInputTokens limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 125000, // Gemini free tier limit
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should not truncate because total tokens are below maxInputTokens
+			expect(result.messages).toEqual(messagesWithSmallContent)
+			expect(result.summary).toBe("")
+			expect(result.cost).toBe(0)
+			expect(result.prevContextTokens).toBe(50000)
+		})
 	})
 })
diff --git a/src/core/sliding-window/index.ts b/src/core/sliding-window/index.ts
index 1e518c9a56..32183c5fc2 100644
--- a/src/core/sliding-window/index.ts
+++ b/src/core/sliding-window/index.ts
@@ -68,6 +68,7 @@ type TruncateOptions = {
 	totalTokens: number
 	contextWindow: number
 	maxTokens?: number | null
+	maxInputTokens?: number | null
 	apiHandler: ApiHandler
 	autoCondenseContext: boolean
 	autoCondenseContextPercent: number
@@ -93,6 +94,7 @@ export async function truncateConversationIfNeeded({
 	totalTokens,
 	contextWindow,
 	maxTokens,
+	maxInputTokens,
 	apiHandler,
 	autoCondenseContext,
 	autoCondenseContextPercent,
@@ -119,8 +121,16 @@ export async function truncateConversationIfNeeded({
 	const prevContextTokens = totalTokens + lastMessageTokens
 
 	// Calculate available tokens for conversation history
-	// Truncate if we're within TOKEN_BUFFER_PERCENTAGE of the context window
-	const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+	// First check if there's a maxInputTokens limit (e.g., for Gemini free tier)
+	let allowedTokens: number
+	if (maxInputTokens && maxInputTokens > 0) {
+		// Use the more restrictive limit between maxInputTokens and context window
+		const contextWindowLimit = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+		allowedTokens = Math.min(maxInputTokens, contextWindowLimit)
+	} else {
+		// Truncate if we're within TOKEN_BUFFER_PERCENTAGE of the context window
+		allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+	}
 
 	// Determine the effective threshold to use
 	let effectiveThreshold = autoCondenseContextPercent
diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts
index c5be865731..418b0fbdc6 100644
--- a/src/core/task/Task.ts
+++ b/src/core/task/Task.ts
@@ -2456,6 +2456,7 @@ export class Task extends EventEmitter implements TaskLike {
 				messages: this.apiConversationHistory,
 				totalTokens: contextTokens || 0,
 				maxTokens,
+				maxInputTokens: this.apiConfiguration.maxInputTokens,
 				contextWindow,
 				apiHandler: this.api,
 				autoCondenseContext: true,
@@ -2571,6 +2572,7 @@ export class Task extends EventEmitter implements TaskLike {
 				messages: this.apiConversationHistory,
 				totalTokens: contextTokens,
 				maxTokens,
+				maxInputTokens: this.apiConfiguration.maxInputTokens,
 				contextWindow,
 				apiHandler: this.api,
 				autoCondenseContext,