From a1125ce31e0404ef5fdb611a5e8dc0c296d2bf8d Mon Sep 17 00:00:00 2001
From: Roo Code
Date: Wed, 10 Sep 2025 15:20:41 +0000
Subject: [PATCH] feat: add maxInputTokens parameter to enforce per-request token limits

- Add maxInputTokens field to ProviderSettings schema to allow configuring max input tokens per request
- Update sliding window logic to respect maxInputTokens limit when provided
- Use the more restrictive limit between maxInputTokens and context window
- Add comprehensive tests for the new functionality

This addresses the issue where Gemini 2.5 Pro free tier users cannot enforce
the 125k input token limit, causing 429 errors. Users can now set
maxInputTokens: 125000 in their API configuration to stay within the free
tier limits.

Fixes #7853
---
 packages/types/src/provider-settings.ts  |   3 +
 .../__tests__/sliding-window.spec.ts      | 110 ++++++++++++++++++
 src/core/sliding-window/index.ts          |  14 ++-
 src/core/task/Task.ts                     |   2 +
 4 files changed, 127 insertions(+), 2 deletions(-)

diff --git a/packages/types/src/provider-settings.ts b/packages/types/src/provider-settings.ts
index 4dfeacbf07..e7b2adcc31 100644
--- a/packages/types/src/provider-settings.ts
+++ b/packages/types/src/provider-settings.ts
@@ -111,6 +111,9 @@ const baseProviderSettingsSchema = z.object({
 	modelMaxTokens: z.number().optional(),
 	modelMaxThinkingTokens: z.number().optional(),
 
+	// Model input token limit (for providers with per-request limits like Gemini free tier)
+	maxInputTokens: z.number().min(1).optional(),
+
 	// Model verbosity.
 	verbosity: verbosityLevelsSchema.optional(),
 })
diff --git a/src/core/sliding-window/__tests__/sliding-window.spec.ts b/src/core/sliding-window/__tests__/sliding-window.spec.ts
index 0f2c70c81b..6cfccb5a18 100644
--- a/src/core/sliding-window/__tests__/sliding-window.spec.ts
+++ b/src/core/sliding-window/__tests__/sliding-window.spec.ts
@@ -1243,5 +1243,115 @@ describe("Sliding Window", () => {
 			expect(result2).not.toEqual(messagesWithSmallContent)
 			expect(result2.messages.length).toBe(3) // Truncated with 0.5 fraction
 		})
+
+		it("should respect maxInputTokens limit when provided", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test with maxInputTokens set to 125000 (Gemini free tier limit)
+			const result = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 150000, // Above the maxInputTokens limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 125000, // Gemini free tier limit
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate because total tokens exceed maxInputTokens
+			expect(result.messages).toHaveLength(3)
+			expect(result.summary).toBe("")
+			expect(result.cost).toBe(0)
+			expect(result.prevContextTokens).toBe(150000)
+		})
+
+		it("should use the more restrictive limit between maxInputTokens and context window", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test where context window limit is more restrictive
+			// Context window limit: 200000 * 0.9 - 8192 = 171808
+			const result1 = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 171809, // Just above context window limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 300000, // Higher than context window
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate based on context window limit
+			expect(result1.messages).toHaveLength(3)
+
+			// Test where maxInputTokens is more restrictive
+			const result2 = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 100000,
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 50000, // Lower than current tokens
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate based on maxInputTokens limit
+			expect(result2.messages).toHaveLength(3)
+			expect(result2.summary).toBe("")
+			expect(result2.cost).toBe(0)
+			expect(result2.prevContextTokens).toBe(100000)
+		})
+
+		it("should not truncate when maxInputTokens is not exceeded", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test with tokens below maxInputTokens limit
+			const result = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 50000, // Below the maxInputTokens limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 125000, // Gemini free tier limit
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should not truncate because total tokens are below maxInputTokens
+			expect(result.messages).toEqual(messagesWithSmallContent)
+			expect(result.summary).toBe("")
+			expect(result.cost).toBe(0)
+			expect(result.prevContextTokens).toBe(50000)
+		})
 	})
 })
diff --git a/src/core/sliding-window/index.ts b/src/core/sliding-window/index.ts
index 1e518c9a56..32183c5fc2 100644
--- a/src/core/sliding-window/index.ts
+++ b/src/core/sliding-window/index.ts
@@ -68,6 +68,7 @@ type TruncateOptions = {
 	totalTokens: number
 	contextWindow: number
 	maxTokens?: number | null
+	maxInputTokens?: number | null
 	apiHandler: ApiHandler
 	autoCondenseContext: boolean
 	autoCondenseContextPercent: number
@@ -93,6 +94,7 @@ export async function truncateConversationIfNeeded({
 	totalTokens,
 	contextWindow,
 	maxTokens,
+	maxInputTokens,
 	apiHandler,
 	autoCondenseContext,
 	autoCondenseContextPercent,
@@ -119,8 +121,16 @@
 	const prevContextTokens = totalTokens + lastMessageTokens
 
 	// Calculate available tokens for conversation history
-	// Truncate if we're within TOKEN_BUFFER_PERCENTAGE of the context window
-	const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+	// First check if there's a maxInputTokens limit (e.g., for Gemini free tier)
+	let allowedTokens: number
+	if (maxInputTokens && maxInputTokens > 0) {
+		// Use the more restrictive limit between maxInputTokens and context window
+		const contextWindowLimit = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+		allowedTokens = Math.min(maxInputTokens, contextWindowLimit)
+	} else {
+		// Truncate if we're within TOKEN_BUFFER_PERCENTAGE of the context window
+		allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+	}
 
 	// Determine the effective threshold to use
 	let effectiveThreshold = autoCondenseContextPercent
diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts
index c5be865731..418b0fbdc6 100644
--- a/src/core/task/Task.ts
+++ b/src/core/task/Task.ts
@@ -2456,6 +2456,7 @@ export class Task extends EventEmitter implements TaskLike {
 				messages: this.apiConversationHistory,
 				totalTokens: contextTokens || 0,
 				maxTokens,
+				maxInputTokens: this.apiConfiguration.maxInputTokens,
 				contextWindow,
 				apiHandler: this.api,
 				autoCondenseContext: true,
@@ -2571,6 +2572,7 @@ export class Task extends EventEmitter implements TaskLike {
 				messages: this.apiConversationHistory,
 				totalTokens: contextTokens,
 				maxTokens,
+				maxInputTokens: this.apiConfiguration.maxInputTokens,
 				contextWindow,
 				apiHandler: this.api,
 				autoCondenseContext,
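
Note for reviewers (not part of the patch): the sketch below illustrates the limit selection the patch introduces in src/core/sliding-window/index.ts. The computeAllowedTokens helper is hypothetical, and TOKEN_BUFFER_PERCENTAGE = 0.1 is an assumption inferred from the "200000 * 0.9 - 8192 = 171808" comment in the tests; it is not confirmed by the diff itself.

// Hypothetical standalone TypeScript sketch mirroring the patched limit selection.
const TOKEN_BUFFER_PERCENTAGE = 0.1 // assumed value, inferred from the test comment above

function computeAllowedTokens(
	contextWindow: number,
	reservedTokens: number,
	maxInputTokens?: number | null,
): number {
	// Stay a buffer below the context window and reserve room for the model's output.
	const contextWindowLimit = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
	// When a per-request input cap is configured (e.g. Gemini free tier), use whichever limit is stricter.
	return maxInputTokens && maxInputTokens > 0 ? Math.min(maxInputTokens, contextWindowLimit) : contextWindowLimit
}

console.log(computeAllowedTokens(200000, 8192)) // 171808: context window limit only
console.log(computeAllowedTokens(200000, 8192, 300000)) // 171808: context window is the stricter limit
console.log(computeAllowedTokens(200000, 8192, 125000)) // 125000: maxInputTokens is the stricter limit

As the commit message notes, users opt in by adding maxInputTokens: 125000 to their API configuration; truncation or condensing then keeps each request under whichever limit is stricter.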