@@ -190,6 +190,95 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(20_000) // Should use model.maxTokens since it's exactly at 20%
 	})
 
+	test("should bypass 20% cap for GPT-5 models and use exact configured max tokens", () => {
+		const model: ModelInfo = {
+			contextWindow: 200_000,
+			supportsPromptCache: false,
+			maxTokens: 128_000, // 64% of context window, normally would be capped
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "openai",
+		}
+
+		// Test various GPT-5 model IDs
+		const gpt5ModelIds = ["gpt-5", "gpt-5-turbo", "GPT-5", "openai/gpt-5-preview", "gpt-5-32k", "GPT-5-TURBO"]
+
+		gpt5ModelIds.forEach((modelId) => {
+			const result = getModelMaxOutputTokens({
+				modelId,
+				model,
+				settings,
+				format: "openai",
+			})
+			// Should use full 128k tokens, not capped to 20% (40k)
+			expect(result).toBe(128_000)
+		})
+	})
+
+	test("should still apply 20% cap to non-GPT-5 models", () => {
+		const model: ModelInfo = {
+			contextWindow: 200_000,
+			supportsPromptCache: false,
+			maxTokens: 128_000, // 64% of context window, should be capped
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "openai",
+		}
+
+		// Test non-GPT-5 model IDs
+		const nonGpt5ModelIds = ["gpt-4", "gpt-4-turbo", "gpt-3.5-turbo", "claude-3-5-sonnet", "gemini-pro"]
+
+		nonGpt5ModelIds.forEach((modelId) => {
+			const result = getModelMaxOutputTokens({
+				modelId,
+				model,
+				settings,
+				format: "openai",
+			})
+			// Should be capped to 20% of context window: 200_000 * 0.2 = 40_000
+			expect(result).toBe(40_000)
+		})
+	})
+
+	test("should handle GPT-5 models with various max token configurations", () => {
+		const testCases = [
+			{
+				maxTokens: 128_000,
+				contextWindow: 200_000,
+				expected: 128_000, // Uses full 128k
+			},
+			{
+				maxTokens: 64_000,
+				contextWindow: 200_000,
+				expected: 64_000, // Uses configured 64k
+			},
+			{
+				maxTokens: 256_000,
+				contextWindow: 400_000,
+				expected: 256_000, // Uses full 256k even though it's 64% of context
+			},
+		]
+
+		testCases.forEach(({ maxTokens, contextWindow, expected }) => {
+			const model: ModelInfo = {
+				contextWindow,
+				supportsPromptCache: false,
+				maxTokens,
+			}
+
+			const result = getModelMaxOutputTokens({
+				modelId: "gpt-5-turbo",
+				model,
+				settings: { apiProvider: "openai" },
+				format: "openai",
+			})
+
+			expect(result).toBe(expected)
+		})
+	})
+
 	test("should return modelMaxTokens from settings when reasoning budget is required", () => {
 		const model: ModelInfo = {
 			contextWindow: 200_000,
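
For reference, the rule these tests pin down can be summed up in a few lines. The following is only a sketch inferred from the expectations above, not the PR's actual implementation; the helper name isGpt5ModelId, the trimmed-down ModelInfoSketch type, and the hard-coded 0.2 factor are assumptions based solely on what the tests assert.

// Sketch (assumed): a max-output-token clamp with a GPT-5 bypass, matching the tests above.
interface ModelInfoSketch {
	contextWindow: number
	supportsPromptCache: boolean
	maxTokens?: number
}

// Hypothetical helper: case-insensitive match for "gpt-5" anywhere in the model ID,
// so "GPT-5-TURBO" and "openai/gpt-5-preview" qualify while "gpt-4" and "claude-3-5-sonnet" do not.
const isGpt5ModelId = (modelId: string): boolean => modelId.toLowerCase().includes("gpt-5")

function maxOutputTokensSketch(modelId: string, model: ModelInfoSketch): number | undefined {
	if (!model.maxTokens) return undefined

	// GPT-5 models bypass the clamp and use the configured maxTokens verbatim (128_000 stays 128_000).
	if (isGpt5ModelId(modelId)) return model.maxTokens

	// Everything else is clamped to 20% of the context window:
	// 128_000 configured against a 200_000 window yields 40_000.
	return Math.min(model.maxTokens, Math.floor(model.contextWindow * 0.2))
}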