Skip to content

Commit e3a3883

Browse files
committed
fix: ensure user's max tokens setting overrides model defaults across all providers

- Updated BaseOpenAiCompatibleProvider to use getModelMaxOutputTokens()
- Fixed ChutesHandler to respect user's custom max tokens
- Fixed LiteLLM createMessage and completePrompt methods
- Fixed Glama createMessage and completePrompt methods
- Fixed Unbound createMessage and completePrompt methods
- Fixed Mistral getModel method to use getModelMaxOutputTokens()
- Fixed XAI to use getModelMaxOutputTokens()
- Fixed OpenAI addMaxTokensIfNeeded to use getModelMaxOutputTokens()
- Fixed Gemini to use maxTokens from getModel(), which already applies user settings

This ensures that when users set a custom max output tokens value in their provider settings, it will be respected across all providers (capped to the model's actual maximum).
1 parent 6a4653a commit e3a3883

File tree: 7 files changed, +70 −31 lines

src/api/providers/gemini.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ export class GeminiHandler extends BaseProvider implements SingleCompletionHandl
7171
systemInstruction,
7272
httpOptions: this.options.googleGeminiBaseUrl ? { baseUrl: this.options.googleGeminiBaseUrl } : undefined,
7373
thinkingConfig,
74-
maxOutputTokens: this.options.modelMaxTokens ?? maxTokens ?? undefined,
74+
maxOutputTokens: maxTokens ?? undefined,
7575
temperature: this.options.modelTemperature ?? 0,
7676
}
7777

src/api/providers/glama.ts

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import OpenAI from "openai"
55
import { glamaDefaultModelId, glamaDefaultModelInfo, GLAMA_DEFAULT_TEMPERATURE } from "@roo-code/types"
66

77
import { Package } from "../../shared/package"
8-
import { ApiHandlerOptions } from "../../shared/api"
8+
import { ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/api"
99

1010
import { ApiStream } from "../transform/stream"
1111
import { convertToOpenAiMessages } from "../transform/openai-format"
@@ -49,12 +49,14 @@ export class GlamaHandler extends RouterProvider implements SingleCompletionHand
4949
addCacheBreakpoints(systemPrompt, openAiMessages)
5050
}
5151

52-
// Required by Anthropic; other providers default to max tokens allowed.
53-
let maxTokens: number | undefined
54-
55-
if (modelId.startsWith("anthropic/")) {
56-
maxTokens = info.maxTokens ?? undefined
57-
}
52+
// Use getModelMaxOutputTokens to respect user's custom max tokens setting
53+
const maxTokens = modelId.startsWith("anthropic/")
54+
? getModelMaxOutputTokens({
55+
modelId,
56+
model: info,
57+
settings: this.options as any,
58+
})
59+
: undefined
5860

5961
const requestOptions: OpenAI.Chat.ChatCompletionCreateParams = {
6062
model: modelId,
@@ -130,7 +132,11 @@ export class GlamaHandler extends RouterProvider implements SingleCompletionHand
130132
}
131133

132134
if (modelId.startsWith("anthropic/")) {
133-
requestOptions.max_tokens = info.maxTokens
135+
requestOptions.max_tokens = getModelMaxOutputTokens({
136+
modelId,
137+
model: info,
138+
settings: this.options as any,
139+
})
134140
}
135141

136142
const response = await this.client.chat.completions.create(requestOptions)

src/api/providers/lite-llm.ts

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import { litellmDefaultModelId, litellmDefaultModelInfo } from "@roo-code/types"
55

66
import { calculateApiCostOpenAI } from "../../shared/cost"
77

8-
import { ApiHandlerOptions } from "../../shared/api"
8+
import { ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/api"
99

1010
import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
1111
import { convertToOpenAiMessages } from "../transform/openai-format"
@@ -44,8 +44,12 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
4444
...convertToOpenAiMessages(messages),
4545
]
4646

47-
// Required by some providers; others default to max tokens allowed
48-
let maxTokens: number | undefined = info.maxTokens ?? undefined
47+
// Use getModelMaxOutputTokens to respect user's custom max tokens setting
48+
const maxTokens = getModelMaxOutputTokens({
49+
modelId,
50+
model: info,
51+
settings: this.options as any,
52+
})
4953

5054
const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
5155
model: modelId,
@@ -119,7 +123,11 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
119123
requestOptions.temperature = this.options.modelTemperature ?? 0
120124
}
121125

122-
requestOptions.max_tokens = info.maxTokens
126+
requestOptions.max_tokens = getModelMaxOutputTokens({
127+
modelId,
128+
model: info,
129+
settings: this.options as any,
130+
})
123131

124132
const response = await this.client.chat.completions.create(requestOptions)
125133
return response.choices[0]?.message.content || ""

src/api/providers/mistral.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { Mistral } from "@mistralai/mistralai"
33

44
import { type MistralModelId, mistralDefaultModelId, mistralModels, MISTRAL_DEFAULT_TEMPERATURE } from "@roo-code/types"
55

6-
import { ApiHandlerOptions } from "../../shared/api"
6+
import { ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/api"
77

88
import { convertToMistralMessages } from "../transform/mistral-format"
99
import { ApiStream } from "../transform/stream"
@@ -78,7 +78,13 @@ export class MistralHandler extends BaseProvider implements SingleCompletionHand
7878
const info = mistralModels[id as MistralModelId] ?? mistralModels[mistralDefaultModelId]
7979

8080
// @TODO: Move this to the `getModelParams` function.
81-
const maxTokens = this.options.includeMaxTokens ? info.maxTokens : undefined
81+
const maxTokens = this.options.includeMaxTokens
82+
? getModelMaxOutputTokens({
83+
modelId: id,
84+
model: info,
85+
settings: { ...this.options, apiProvider: "mistral" } as any,
86+
})
87+
: undefined
8288
const temperature = this.options.modelTemperature ?? MISTRAL_DEFAULT_TEMPERATURE
8389

8490
return { id, info, maxTokens, temperature }

src/api/providers/openai.ts

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import {
1010
OPENAI_AZURE_AI_INFERENCE_PATH,
1111
} from "@roo-code/types"
1212

13-
import type { ApiHandlerOptions } from "../../shared/api"
13+
import { ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/api"
1414

1515
import { XmlMatcher } from "../../utils/xml-matcher"
1616

@@ -403,9 +403,14 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
403403
): void {
404404
// Only add max_completion_tokens if includeMaxTokens is true
405405
if (this.options.includeMaxTokens === true) {
406-
// Use user-configured modelMaxTokens if available, otherwise fall back to model's default maxTokens
406+
// Use getModelMaxOutputTokens to properly handle user settings and model limits
407407
// Using max_completion_tokens as max_tokens is deprecated
408-
requestOptions.max_completion_tokens = this.options.modelMaxTokens || modelInfo.maxTokens
408+
const modelId = this.options.openAiModelId ?? ""
409+
requestOptions.max_completion_tokens = getModelMaxOutputTokens({
410+
modelId,
411+
model: modelInfo,
412+
settings: { ...this.options, apiProvider: "openai" } as any,
413+
})
409414
}
410415
}
411416
}

src/api/providers/unbound.ts

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import OpenAI from "openai"
44
import { unboundDefaultModelId, unboundDefaultModelInfo } from "@roo-code/types"
55

66
import type { ApiHandlerOptions } from "../../shared/api"
7+
import { getModelMaxOutputTokens } from "../../shared/api"
78

89
import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
910
import { convertToOpenAiMessages } from "../transform/openai-format"
@@ -76,12 +77,14 @@ export class UnboundHandler extends RouterProvider implements SingleCompletionHa
7677
addVertexCacheBreakpoints(messages)
7778
}
7879

79-
// Required by Anthropic; other providers default to max tokens allowed.
80-
let maxTokens: number | undefined
81-
82-
if (modelId.startsWith("anthropic/")) {
83-
maxTokens = info.maxTokens ?? undefined
84-
}
80+
// Use getModelMaxOutputTokens to respect user's custom max tokens setting
81+
const maxTokens = modelId.startsWith("anthropic/")
82+
? getModelMaxOutputTokens({
83+
modelId,
84+
model: info,
85+
settings: this.options as any,
86+
})
87+
: undefined
8588

8689
const requestOptions: UnboundChatCompletionCreateParamsStreaming = {
8790
model: modelId.split("/")[1],
@@ -149,7 +152,11 @@ export class UnboundHandler extends RouterProvider implements SingleCompletionHa
149152
}
150153

151154
if (modelId.startsWith("anthropic/")) {
152-
requestOptions.max_tokens = info.maxTokens
155+
requestOptions.max_tokens = getModelMaxOutputTokens({
156+
modelId,
157+
model: info,
158+
settings: this.options as any,
159+
})
153160
}
154161

155162
const response = await this.client.chat.completions.create(requestOptions, { headers: DEFAULT_HEADERS })

src/api/providers/xai.ts

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import OpenAI from "openai"
33

44
import { type XAIModelId, xaiDefaultModelId, xaiModels } from "@roo-code/types"
55

6-
import type { ApiHandlerOptions } from "../../shared/api"
6+
import { ApiHandlerOptions, getModelMaxOutputTokens } from "../../shared/api"
77

88
import { ApiStream } from "../transform/stream"
99
import { convertToOpenAiMessages } from "../transform/openai-format"
@@ -50,7 +50,11 @@ export class XAIHandler extends BaseProvider implements SingleCompletionHandler
5050
// Use the OpenAI-compatible API.
5151
const stream = await this.client.chat.completions.create({
5252
model: modelId,
53-
max_tokens: modelInfo.maxTokens,
53+
max_tokens: getModelMaxOutputTokens({
54+
modelId,
55+
model: modelInfo,
56+
settings: { ...this.options, apiProvider: "xai" } as any,
57+
}),
5458
temperature: this.options.modelTemperature ?? XAI_DEFAULT_TEMPERATURE,
5559
messages: [{ role: "system", content: systemPrompt }, ...convertToOpenAiMessages(messages)],
5660
stream: true,
@@ -78,12 +82,15 @@ export class XAIHandler extends BaseProvider implements SingleCompletionHandler
7882
if (chunk.usage) {
7983
// Extract detailed token information if available
8084
// First check for prompt_tokens_details structure (real API response)
81-
const promptDetails = "prompt_tokens_details" in chunk.usage ? chunk.usage.prompt_tokens_details : null;
82-
const cachedTokens = promptDetails && "cached_tokens" in promptDetails ? promptDetails.cached_tokens : 0;
85+
const promptDetails = "prompt_tokens_details" in chunk.usage ? chunk.usage.prompt_tokens_details : null
86+
const cachedTokens = promptDetails && "cached_tokens" in promptDetails ? promptDetails.cached_tokens : 0
8387

8488
// Fall back to direct fields in usage (used in test mocks)
85-
const readTokens = cachedTokens || ("cache_read_input_tokens" in chunk.usage ? (chunk.usage as any).cache_read_input_tokens : 0);
86-
const writeTokens = "cache_creation_input_tokens" in chunk.usage ? (chunk.usage as any).cache_creation_input_tokens : 0;
89+
const readTokens =
90+
cachedTokens ||
91+
("cache_read_input_tokens" in chunk.usage ? (chunk.usage as any).cache_read_input_tokens : 0)
92+
const writeTokens =
93+
"cache_creation_input_tokens" in chunk.usage ? (chunk.usage as any).cache_creation_input_tokens : 0
8794

8895
yield {
8996
type: "usage",

Comments (0)