
Commit a1125ce

feat: add maxInputTokens parameter to enforce per-request token limits

- Add maxInputTokens field to ProviderSettings schema to allow configuring max input tokens per request
- Update sliding window logic to respect maxInputTokens limit when provided
- Use the more restrictive limit between maxInputTokens and context window
- Add comprehensive tests for the new functionality

This addresses the issue where Gemini 2.5 Pro free tier users cannot enforce the 125k input token limit, causing 429 errors. Users can now set maxInputTokens: 125000 in their API configuration to stay within the free tier limits.

Fixes #7853
1 parent 7cd6520 commit a1125ce
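
As the commit message notes, the cap is opt-in and set per provider profile. A minimal sketch of a settings object that would satisfy the new schema (the overall profile shape here is illustrative; only maxInputTokens is added by this commit, and modelMaxTokens is an existing field from baseProviderSettingsSchema):

	// Hypothetical provider profile: the new field caps per-request input size
	// so truncation runs before the provider returns a 429.
	const providerSettings = {
		modelMaxTokens: 8192, // existing field: output token budget
		maxInputTokens: 125000, // new field: Gemini 2.5 Pro free tier input limit
	}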

File tree

4 files changed: +127 −2 lines changed

packages/types/src/provider-settings.ts

Lines changed: 3 additions & 0 deletions
@@ -111,6 +111,9 @@ const baseProviderSettingsSchema = z.object({
 	modelMaxTokens: z.number().optional(),
 	modelMaxThinkingTokens: z.number().optional(),
 
+	// Model input token limit (for providers with per-request limits like Gemini free tier)
+	maxInputTokens: z.number().min(1).optional(),
+
 	// Model verbosity.
 	verbosity: verbosityLevelsSchema.optional(),
 })
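
Because the field is declared z.number().min(1).optional(), omitting it remains valid while zero and negative values are rejected. A minimal standalone sketch of that validation behavior (mirroring only the new field, not the full provider settings schema):

	import { z } from "zod"

	// Mirrors just the field added above.
	const schema = z.object({ maxInputTokens: z.number().min(1).optional() })

	schema.safeParse({}).success // true: the field is optional
	schema.safeParse({ maxInputTokens: 125000 }).success // true: positive limit accepted
	schema.safeParse({ maxInputTokens: 0 }).success // false: min(1) rejects zero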

src/core/sliding-window/__tests__/sliding-window.spec.ts

Lines changed: 110 additions & 0 deletions
@@ -1243,5 +1243,115 @@ describe("Sliding Window", () => {
 			expect(result2).not.toEqual(messagesWithSmallContent)
 			expect(result2.messages.length).toBe(3) // Truncated with 0.5 fraction
 		})
+
+		it("should respect maxInputTokens limit when provided", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test with maxInputTokens set to 125000 (Gemini free tier limit)
+			const result = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 150000, // Above the maxInputTokens limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 125000, // Gemini free tier limit
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate because total tokens exceed maxInputTokens
+			expect(result.messages).toHaveLength(3)
+			expect(result.summary).toBe("")
+			expect(result.cost).toBe(0)
+			expect(result.prevContextTokens).toBe(150000)
+		})
+
+		it("should use the more restrictive limit between maxInputTokens and context window", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test where context window limit is more restrictive
+			// Context window limit: 200000 * 0.9 - 8192 = 171808
+			const result1 = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 171809, // Just above context window limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 300000, // Higher than context window
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate based on context window limit
+			expect(result1.messages).toHaveLength(3)
+
+			// Test where maxInputTokens is more restrictive
+			const result2 = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 100000,
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 50000, // Lower than current tokens
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate based on maxInputTokens limit
+			expect(result2.messages).toHaveLength(3)
+			expect(result2.summary).toBe("")
+			expect(result2.cost).toBe(0)
+			expect(result2.prevContextTokens).toBe(100000)
+		})
+
+		it("should not truncate when maxInputTokens is not exceeded", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test with tokens below maxInputTokens limit
+			const result = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 50000, // Below the maxInputTokens limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 125000, // Gemini free tier limit
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should not truncate because total tokens are below maxInputTokens
+			expect(result.messages).toEqual(messagesWithSmallContent)
+			expect(result.summary).toBe("")
+			expect(result.cost).toBe(0)
+			expect(result.prevContextTokens).toBe(50000)
+		})
 	})
 })
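
The 171809 boundary in the second test follows from the buffer arithmetic quoted in its comment. A quick check of those numbers (assuming TOKEN_BUFFER_PERCENTAGE is 0.1, which the 0.9 factor in the test comment implies, and that reservedTokens equals the model's 8192 maxTokens):

	// Worked numbers behind the "just above context window limit" case.
	const contextWindow = 200000
	const reservedTokens = 8192
	const contextWindowLimit = contextWindow * (1 - 0.1) - reservedTokens

	console.log(contextWindowLimit) // 171808
	console.log(171809 > contextWindowLimit) // true: the truncation path is taken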

src/core/sliding-window/index.ts

Lines changed: 12 additions & 2 deletions
@@ -68,6 +68,7 @@ type TruncateOptions = {
 	totalTokens: number
 	contextWindow: number
 	maxTokens?: number | null
+	maxInputTokens?: number | null
 	apiHandler: ApiHandler
 	autoCondenseContext: boolean
 	autoCondenseContextPercent: number
@@ -93,6 +94,7 @@ export async function truncateConversationIfNeeded({
 	totalTokens,
 	contextWindow,
 	maxTokens,
+	maxInputTokens,
 	apiHandler,
 	autoCondenseContext,
 	autoCondenseContextPercent,
@@ -119,8 +121,16 @@ export async function truncateConversationIfNeeded({
 	const prevContextTokens = totalTokens + lastMessageTokens
 
 	// Calculate available tokens for conversation history
-	// Truncate if we're within TOKEN_BUFFER_PERCENTAGE of the context window
-	const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+	// First check if there's a maxInputTokens limit (e.g., for Gemini free tier)
+	let allowedTokens: number
+	if (maxInputTokens && maxInputTokens > 0) {
+		// Use the more restrictive limit between maxInputTokens and context window
+		const contextWindowLimit = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+		allowedTokens = Math.min(maxInputTokens, contextWindowLimit)
+	} else {
+		// Truncate if we're within TOKEN_BUFFER_PERCENTAGE of the context window
+		allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+	}
 
 	// Determine the effective threshold to use
 	let effectiveThreshold = autoCondenseContextPercent
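
The branch picks whichever cap is lower, and an absent or non-positive maxInputTokens falls through to the legacy context-window-only calculation, so existing profiles are unaffected. A self-contained sketch of the same selection (assuming TOKEN_BUFFER_PERCENTAGE is 0.1 and simplifying reservedTokens to the model's maxTokens):

	const TOKEN_BUFFER_PERCENTAGE = 0.1 // assumed value, consistent with the tests' 0.9 factor

	// Re-derives allowedTokens the same way the branch above does.
	function allowedTokensFor(contextWindow: number, reservedTokens: number, maxInputTokens?: number | null): number {
		const contextWindowLimit = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
		return maxInputTokens && maxInputTokens > 0 ? Math.min(maxInputTokens, contextWindowLimit) : contextWindowLimit
	}

	allowedTokensFor(200000, 8192) // 171808: the context window governs
	allowedTokensFor(200000, 8192, 125000) // 125000: the free tier cap is stricter
	allowedTokensFor(200000, 8192, 300000) // 171808: the context window is stricter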

src/core/task/Task.ts

Lines changed: 2 additions & 0 deletions
@@ -2456,6 +2456,7 @@ export class Task extends EventEmitter<TaskEvents> implements TaskLike {
 			messages: this.apiConversationHistory,
 			totalTokens: contextTokens || 0,
 			maxTokens,
+			maxInputTokens: this.apiConfiguration.maxInputTokens,
 			contextWindow,
 			apiHandler: this.api,
 			autoCondenseContext: true,
@@ -2571,6 +2572,7 @@ export class Task extends EventEmitter<TaskEvents> implements TaskLike {
 			messages: this.apiConversationHistory,
 			totalTokens: contextTokens,
 			maxTokens,
+			maxInputTokens: this.apiConfiguration.maxInputTokens,
 			contextWindow,
 			apiHandler: this.api,
 			autoCondenseContext,
