diff --git a/src/shared/__tests__/api.spec.ts b/src/shared/__tests__/api.spec.ts
index 80c3db1b7d7..a56fea8e14f 100644
--- a/src/shared/__tests__/api.spec.ts
+++ b/src/shared/__tests__/api.spec.ts
@@ -190,6 +190,95 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(20_000) // Should use model.maxTokens since it's exactly at 20%
 	})
 
+	test("should bypass 20% cap for GPT-5 models and use exact configured max tokens", () => {
+		const model: ModelInfo = {
+			contextWindow: 200_000,
+			supportsPromptCache: false,
+			maxTokens: 128_000, // 64% of context window, normally would be capped
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "openai",
+		}
+
+		// Test various GPT-5 model IDs
+		const gpt5ModelIds = ["gpt-5", "gpt-5-turbo", "GPT-5", "openai/gpt-5-preview", "gpt-5-32k", "GPT-5-TURBO"]
+
+		gpt5ModelIds.forEach((modelId) => {
+			const result = getModelMaxOutputTokens({
+				modelId,
+				model,
+				settings,
+				format: "openai",
+			})
+			// Should use full 128k tokens, not capped to 20% (40k)
+			expect(result).toBe(128_000)
+		})
+	})
+
+	test("should still apply 20% cap to non-GPT-5 models", () => {
+		const model: ModelInfo = {
+			contextWindow: 200_000,
+			supportsPromptCache: false,
+			maxTokens: 128_000, // 64% of context window, should be capped
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "openai",
+		}
+
+		// Test non-GPT-5 model IDs
+		const nonGpt5ModelIds = ["gpt-4", "gpt-4-turbo", "gpt-3.5-turbo", "claude-3-5-sonnet", "gemini-pro"]
+
+		nonGpt5ModelIds.forEach((modelId) => {
+			const result = getModelMaxOutputTokens({
+				modelId,
+				model,
+				settings,
+				format: "openai",
+			})
+			// Should be capped to 20% of context window: 200_000 * 0.2 = 40_000
+			expect(result).toBe(40_000)
+		})
+	})
+
+	test("should handle GPT-5 models with various max token configurations", () => {
+		const testCases = [
+			{
+				maxTokens: 128_000,
+				contextWindow: 200_000,
+				expected: 128_000, // Uses full 128k
+			},
+			{
+				maxTokens: 64_000,
+				contextWindow: 200_000,
+				expected: 64_000, // Uses configured 64k
+			},
+			{
+				maxTokens: 256_000,
+				contextWindow: 400_000,
+				expected: 256_000, // Uses full 256k even though it's 64% of context
+			},
+		]
+
+		testCases.forEach(({ maxTokens, contextWindow, expected }) => {
+			const model: ModelInfo = {
+				contextWindow,
+				supportsPromptCache: false,
+				maxTokens,
+			}
+
+			const result = getModelMaxOutputTokens({
+				modelId: "gpt-5-turbo",
+				model,
+				settings: { apiProvider: "openai" },
+				format: "openai",
+			})
+
+			expect(result).toBe(expected)
+		})
+	})
+
 	test("should return modelMaxTokens from settings when reasoning budget is required", () => {
 		const model: ModelInfo = {
 			contextWindow: 200_000,
diff --git a/src/shared/api.ts b/src/shared/api.ts
index 01f8fa2dbf9..f1bf7dbaea4 100644
--- a/src/shared/api.ts
+++ b/src/shared/api.ts
@@ -107,7 +107,17 @@ export const getModelMaxOutputTokens = ({
 	}
 
 	// If model has explicit maxTokens, clamp it to 20% of the context window
+	// Exception: GPT-5 models should use their exact configured max output tokens
 	if (model.maxTokens) {
+		// Check if this is a GPT-5 model (case-insensitive)
+		const isGpt5Model = modelId.toLowerCase().includes("gpt-5")
+
+		// GPT-5 models bypass the 20% cap and use their full configured max tokens
+		if (isGpt5Model) {
+			return model.maxTokens
+		}
+
+		// All other models are clamped to 20% of context window
 		return Math.min(model.maxTokens, Math.ceil(model.contextWindow * 0.2))
 	}
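
For reference, here is a minimal standalone sketch of the branching behavior this diff introduces, stripped of the surrounding provider plumbing. The names `ModelInfoSketch` and `maxOutputTokensSketch` are hypothetical and exist only for this illustration; the real `getModelMaxOutputTokens` takes additional parameters (`settings`, `format`) and handles reasoning budgets before reaching this clamp.

```ts
// Hypothetical reduced shape; the real ModelInfo has more fields.
interface ModelInfoSketch {
	contextWindow: number
	maxTokens?: number
}

// Illustrative sketch of the clamp-with-GPT-5-exception logic from the diff.
function maxOutputTokensSketch(modelId: string, model: ModelInfoSketch): number | undefined {
	if (!model.maxTokens) return undefined

	// GPT-5 models (matched case-insensitively on the model ID) keep their
	// exact configured maximum, bypassing the cap.
	if (modelId.toLowerCase().includes("gpt-5")) {
		return model.maxTokens
	}

	// All other models are clamped to 20% of the context window.
	return Math.min(model.maxTokens, Math.ceil(model.contextWindow * 0.2))
}

// Mirrors the test expectations above:
// maxOutputTokensSketch("gpt-5-turbo", { contextWindow: 200_000, maxTokens: 128_000 }) // => 128_000
// maxOutputTokensSketch("gpt-4",       { contextWindow: 200_000, maxTokens: 128_000 }) // => 40_000
```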