
Commit a1125ce

feat: add maxInputTokens parameter to enforce per-request token limits

- Add maxInputTokens field to ProviderSettings schema to allow configuring max input tokens per request
- Update sliding window logic to respect maxInputTokens limit when provided
- Use the more restrictive limit between maxInputTokens and context window
- Add comprehensive tests for the new functionality

This addresses the issue where Gemini 2.5 Pro free tier users cannot enforce the 125k input token limit, causing 429 errors. Users can now set maxInputTokens: 125000 in their API configuration to stay within the free tier limits.

Fixes #7853
1 parent 7cd6520 commit a1125ce
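
As the commit message notes, the cap is opt-in and set per provider profile. A minimal sketch of a settings object that would satisfy the new schema (the overall profile shape here is illustrative; only maxInputTokens is added by this commit, and modelMaxTokens is an existing field from baseProviderSettingsSchema):

	// Hypothetical provider profile: the new field caps per-request input size
	// so truncation runs before the provider returns a 429.
	const providerSettings = {
		modelMaxTokens: 8192, // existing field: output token budget
		maxInputTokens: 125000, // new field: Gemini 2.5 Pro free tier input limit
	}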

File tree

4 files changed: +127 −2 lines changed

packages/types/src/provider-settings.ts

Lines changed: 3 additions & 0 deletions
@@ -111,6 +111,9 @@ const baseProviderSettingsSchema = z.object({
 	modelMaxTokens: z.number().optional(),
 	modelMaxThinkingTokens: z.number().optional(),
 
+	// Model input token limit (for providers with per-request limits like Gemini free tier)
+	maxInputTokens: z.number().min(1).optional(),
+
 	// Model verbosity.
 	verbosity: verbosityLevelsSchema.optional(),
 })
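
Because the field is declared z.number().min(1).optional(), omitting it remains valid while zero and negative values are rejected. A minimal standalone sketch of that validation behavior (mirroring only the new field, not the full provider settings schema):

	import { z } from "zod"

	// Mirrors just the field added above.
	const schema = z.object({ maxInputTokens: z.number().min(1).optional() })

	schema.safeParse({}).success // true: the field is optional
	schema.safeParse({ maxInputTokens: 125000 }).success // true: positive limit accepted
	schema.safeParse({ maxInputTokens: 0 }).success // false: min(1) rejects zero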

src/core/sliding-window/__tests__/sliding-window.spec.ts

Lines changed: 110 additions & 0 deletions
@@ -1243,5 +1243,115 @@ describe("Sliding Window", () => {
 			expect(result2).not.toEqual(messagesWithSmallContent)
 			expect(result2.messages.length).toBe(3) // Truncated with 0.5 fraction
 		})
+
+		it("should respect maxInputTokens limit when provided", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test with maxInputTokens set to 125000 (Gemini free tier limit)
+			const result = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 150000, // Above the maxInputTokens limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 125000, // Gemini free tier limit
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate because total tokens exceed maxInputTokens
+			expect(result.messages).toHaveLength(3)
+			expect(result.summary).toBe("")
+			expect(result.cost).toBe(0)
+			expect(result.prevContextTokens).toBe(150000)
+		})
+
+		it("should use the more restrictive limit between maxInputTokens and context window", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test where context window limit is more restrictive
+			// Context window limit: 200000 * 0.9 - 8192 = 171808
+			const result1 = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 171809, // Just above context window limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 300000, // Higher than context window
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate based on context window limit
+			expect(result1.messages).toHaveLength(3)
+
+			// Test where maxInputTokens is more restrictive
+			const result2 = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 100000,
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 50000, // Lower than current tokens
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should truncate based on maxInputTokens limit
+			expect(result2.messages).toHaveLength(3)
+			expect(result2.summary).toBe("")
+			expect(result2.cost).toBe(0)
+			expect(result2.prevContextTokens).toBe(100000)
+		})
+
+		it("should not truncate when maxInputTokens is not exceeded", async () => {
+			const modelInfo = createModelInfo(200000, 8192)
+			const messagesWithSmallContent = [
+				...messages.slice(0, -1),
+				{ ...messages[messages.length - 1], content: "" },
+			]
+
+			// Test with tokens below maxInputTokens limit
+			const result = await truncateConversationIfNeeded({
+				messages: messagesWithSmallContent,
+				totalTokens: 50000, // Below the maxInputTokens limit
+				contextWindow: modelInfo.contextWindow,
+				maxTokens: modelInfo.maxTokens,
+				maxInputTokens: 125000, // Gemini free tier limit
+				apiHandler: mockApiHandler,
+				autoCondenseContext: false,
+				autoCondenseContextPercent: 100,
+				systemPrompt: "test",
+				taskId,
+				profileThresholds: {},
+				currentProfileId: "default",
+			})
+
+			// Should not truncate because total tokens are below maxInputTokens
+			expect(result.messages).toEqual(messagesWithSmallContent)
+			expect(result.summary).toBe("")
+			expect(result.cost).toBe(0)
+			expect(result.prevContextTokens).toBe(50000)
+		})
 	})
 })
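
The 171809 boundary in the second test follows from the buffer arithmetic quoted in its comment. A quick check of those numbers (assuming TOKEN_BUFFER_PERCENTAGE is 0.1, which the 0.9 factor in the test comment implies, and that reservedTokens equals the model's 8192 maxTokens):

	// Worked numbers behind the "just above context window limit" case.
	const contextWindow = 200000
	const reservedTokens = 8192
	const contextWindowLimit = contextWindow * (1 - 0.1) - reservedTokens

	console.log(contextWindowLimit) // 171808
	console.log(171809 > contextWindowLimit) // true: the truncation path is taken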

src/core/sliding-window/index.ts

Lines changed: 12 additions & 2 deletions
@@ -68,6 +68,7 @@ type TruncateOptions = {
 	totalTokens: number
 	contextWindow: number
 	maxTokens?: number | null
+	maxInputTokens?: number | null
 	apiHandler: ApiHandler
 	autoCondenseContext: boolean
 	autoCondenseContextPercent: number
@@ -93,6 +94,7 @@ export async function truncateConversationIfNeeded({
 	totalTokens,
 	contextWindow,
 	maxTokens,
+	maxInputTokens,
 	apiHandler,
 	autoCondenseContext,
 	autoCondenseContextPercent,
@@ -119,8 +121,16 @@ export async function truncateConversationIfNeeded({
 	const prevContextTokens = totalTokens + lastMessageTokens
 
 	// Calculate available tokens for conversation history
-	// Truncate if we're within TOKEN_BUFFER_PERCENTAGE of the context window
-	const allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+	// First check if there's a maxInputTokens limit (e.g., for Gemini free tier)
+	let allowedTokens: number
+	if (maxInputTokens && maxInputTokens > 0) {
+		// Use the more restrictive limit between maxInputTokens and context window
+		const contextWindowLimit = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+		allowedTokens = Math.min(maxInputTokens, contextWindowLimit)
+	} else {
+		// Truncate if we're within TOKEN_BUFFER_PERCENTAGE of the context window
+		allowedTokens = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
+	}
 
 	// Determine the effective threshold to use
 	let effectiveThreshold = autoCondenseContextPercent
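
The branch picks whichever cap is lower, and an absent or non-positive maxInputTokens falls through to the legacy context-window-only calculation, so existing profiles are unaffected. A self-contained sketch of the same selection (assuming TOKEN_BUFFER_PERCENTAGE is 0.1 and simplifying reservedTokens to the model's maxTokens):

	const TOKEN_BUFFER_PERCENTAGE = 0.1 // assumed value, consistent with the tests' 0.9 factor

	// Re-derives allowedTokens the same way the branch above does.
	function allowedTokensFor(contextWindow: number, reservedTokens: number, maxInputTokens?: number | null): number {
		const contextWindowLimit = contextWindow * (1 - TOKEN_BUFFER_PERCENTAGE) - reservedTokens
		return maxInputTokens && maxInputTokens > 0 ? Math.min(maxInputTokens, contextWindowLimit) : contextWindowLimit
	}

	allowedTokensFor(200000, 8192) // 171808: the context window governs
	allowedTokensFor(200000, 8192, 125000) // 125000: the free tier cap is stricter
	allowedTokensFor(200000, 8192, 300000) // 171808: the context window is stricter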

src/core/task/Task.ts

Lines changed: 2 additions & 0 deletions
@@ -2456,6 +2456,7 @@ export class Task extends EventEmitter<TaskEvents> implements TaskLike {
 			messages: this.apiConversationHistory,
 			totalTokens: contextTokens || 0,
 			maxTokens,
+			maxInputTokens: this.apiConfiguration.maxInputTokens,
 			contextWindow,
 			apiHandler: this.api,
 			autoCondenseContext: true,
@@ -2571,6 +2572,7 @@ export class Task extends EventEmitter<TaskEvents> implements TaskLike {
 			messages: this.apiConversationHistory,
 			totalTokens: contextTokens,
 			maxTokens,
+			maxInputTokens: this.apiConfiguration.maxInputTokens,
 			contextWindow,
 			apiHandler: this.api,
 			autoCondenseContext,
