Skip to content

Commit 0b72b68

Browse files
author
AlexandruSmirnov
committed
fix: O3 model max_tokens support and code optimizations
- Fixed max_tokens support for O3 models in OpenAI provider
- Refactored OpenAI provider to eliminate code duplication with addMaxTokensIfNeeded helper
- Made Azure AI Inference Service respect the includeMaxTokens checkbox setting
- Applied code optimizations to reduce redundancy
- Added missing translations for includeMaxTokens in Catalan and German locales
- Updated tests to cover new functionality
1 parent 508c245 commit 0b72b68

File tree

4 files changed

+273
-22
lines changed

4 files changed

+273
-22
lines changed

src/api/providers/__tests__/openai.spec.ts

Lines changed: 231 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -441,10 +441,13 @@ describe("OpenAiHandler", () => {
441441
stream: true,
442442
stream_options: { include_usage: true },
443443
temperature: 0,
444-
max_tokens: -1,
445444
},
446445
{ path: "/models/chat/completions" },
447446
)
447+
448+
// Verify max_tokens is NOT included when includeMaxTokens is not set
449+
const callArgs = mockCreate.mock.calls[0][0]
450+
expect(callArgs).not.toHaveProperty("max_tokens")
448451
})
449452

450453
it("should handle non-streaming responses with Azure AI Inference Service", async () => {
@@ -484,10 +487,13 @@ describe("OpenAiHandler", () => {
484487
{ role: "user", content: systemPrompt },
485488
{ role: "user", content: "Hello!" },
486489
],
487-
max_tokens: -1, // Default from openAiModelInfoSaneDefaults
488490
},
489491
{ path: "/models/chat/completions" },
490492
)
493+
494+
// Verify max_tokens is NOT included when includeMaxTokens is not set
495+
const callArgs = mockCreate.mock.calls[0][0]
496+
expect(callArgs).not.toHaveProperty("max_tokens")
491497
})
492498

493499
it("should handle completePrompt with Azure AI Inference Service", async () => {
@@ -498,10 +504,13 @@ describe("OpenAiHandler", () => {
498504
{
499505
model: azureOptions.openAiModelId,
500506
messages: [{ role: "user", content: "Test prompt" }],
501-
max_tokens: -1, // Default from openAiModelInfoSaneDefaults
502507
},
503508
{ path: "/models/chat/completions" },
504509
)
510+
511+
// Verify max_tokens is NOT included when includeMaxTokens is not set
512+
const callArgs = mockCreate.mock.calls[0][0]
513+
expect(callArgs).not.toHaveProperty("max_tokens")
505514
})
506515
})
507516

@@ -544,4 +553,223 @@ describe("OpenAiHandler", () => {
544553
expect(lastCall[0]).not.toHaveProperty("stream_options")
545554
})
546555
})
556+
557+
describe("O3 Family Models", () => {
558+
const o3Options = {
559+
...mockOptions,
560+
openAiModelId: "o3-mini",
561+
openAiCustomModelInfo: {
562+
contextWindow: 128_000,
563+
maxTokens: 65536,
564+
supportsPromptCache: false,
565+
reasoningEffort: "medium" as "low" | "medium" | "high",
566+
},
567+
}
568+
569+
it("should handle O3 model with streaming and include max_tokens when includeMaxTokens is true", async () => {
570+
const o3Handler = new OpenAiHandler({
571+
...o3Options,
572+
includeMaxTokens: true,
573+
modelMaxTokens: 32000,
574+
modelTemperature: 0.5,
575+
})
576+
const systemPrompt = "You are a helpful assistant."
577+
const messages: Anthropic.Messages.MessageParam[] = [
578+
{
579+
role: "user",
580+
content: "Hello!",
581+
},
582+
]
583+
584+
const stream = o3Handler.createMessage(systemPrompt, messages)
585+
const chunks: any[] = []
586+
for await (const chunk of stream) {
587+
chunks.push(chunk)
588+
}
589+
590+
expect(mockCreate).toHaveBeenCalledWith(
591+
expect.objectContaining({
592+
model: "o3-mini",
593+
messages: [
594+
{
595+
role: "developer",
596+
content: "Formatting re-enabled\nYou are a helpful assistant.",
597+
},
598+
{ role: "user", content: "Hello!" },
599+
],
600+
stream: true,
601+
stream_options: { include_usage: true },
602+
reasoning_effort: "medium",
603+
temperature: 0.5,
604+
max_tokens: 32000,
605+
}),
606+
{},
607+
)
608+
})
609+
610+
it("should handle O3 model with streaming and exclude max_tokens when includeMaxTokens is false", async () => {
611+
const o3Handler = new OpenAiHandler({
612+
...o3Options,
613+
includeMaxTokens: false,
614+
modelTemperature: 0.7,
615+
})
616+
const systemPrompt = "You are a helpful assistant."
617+
const messages: Anthropic.Messages.MessageParam[] = [
618+
{
619+
role: "user",
620+
content: "Hello!",
621+
},
622+
]
623+
624+
const stream = o3Handler.createMessage(systemPrompt, messages)
625+
const chunks: any[] = []
626+
for await (const chunk of stream) {
627+
chunks.push(chunk)
628+
}
629+
630+
expect(mockCreate).toHaveBeenCalledWith(
631+
expect.objectContaining({
632+
model: "o3-mini",
633+
messages: [
634+
{
635+
role: "developer",
636+
content: "Formatting re-enabled\nYou are a helpful assistant.",
637+
},
638+
{ role: "user", content: "Hello!" },
639+
],
640+
stream: true,
641+
stream_options: { include_usage: true },
642+
reasoning_effort: "medium",
643+
temperature: 0.7,
644+
}),
645+
{},
646+
)
647+
648+
// Verify max_tokens is NOT included
649+
const callArgs = mockCreate.mock.calls[0][0]
650+
expect(callArgs).not.toHaveProperty("max_tokens")
651+
})
652+
653+
it("should handle O3 model non-streaming with max_tokens and reasoning_effort", async () => {
654+
const o3Handler = new OpenAiHandler({
655+
...o3Options,
656+
openAiStreamingEnabled: false,
657+
includeMaxTokens: true,
658+
modelTemperature: 0.3,
659+
})
660+
const systemPrompt = "You are a helpful assistant."
661+
const messages: Anthropic.Messages.MessageParam[] = [
662+
{
663+
role: "user",
664+
content: "Hello!",
665+
},
666+
]
667+
668+
const stream = o3Handler.createMessage(systemPrompt, messages)
669+
const chunks: any[] = []
670+
for await (const chunk of stream) {
671+
chunks.push(chunk)
672+
}
673+
674+
expect(mockCreate).toHaveBeenCalledWith(
675+
expect.objectContaining({
676+
model: "o3-mini",
677+
messages: [
678+
{
679+
role: "developer",
680+
content: "Formatting re-enabled\nYou are a helpful assistant.",
681+
},
682+
{ role: "user", content: "Hello!" },
683+
],
684+
reasoning_effort: "medium",
685+
temperature: 0.3,
686+
max_tokens: 65536, // Falls back to model default
687+
}),
688+
{},
689+
)
690+
691+
// Verify stream is not set
692+
const callArgs = mockCreate.mock.calls[0][0]
693+
expect(callArgs).not.toHaveProperty("stream")
694+
})
695+
696+
it("should use default temperature of 0 when not specified for O3 models", async () => {
697+
const o3Handler = new OpenAiHandler({
698+
...o3Options,
699+
// No modelTemperature specified
700+
})
701+
const systemPrompt = "You are a helpful assistant."
702+
const messages: Anthropic.Messages.MessageParam[] = [
703+
{
704+
role: "user",
705+
content: "Hello!",
706+
},
707+
]
708+
709+
const stream = o3Handler.createMessage(systemPrompt, messages)
710+
await stream.next()
711+
712+
expect(mockCreate).toHaveBeenCalledWith(
713+
expect.objectContaining({
714+
temperature: 0, // Default temperature
715+
}),
716+
{},
717+
)
718+
})
719+
720+
it("should handle O3 model with Azure AI Inference Service respecting includeMaxTokens", async () => {
721+
const o3AzureHandler = new OpenAiHandler({
722+
...o3Options,
723+
openAiBaseUrl: "https://test.services.ai.azure.com",
724+
includeMaxTokens: false, // Should NOT include max_tokens
725+
})
726+
const systemPrompt = "You are a helpful assistant."
727+
const messages: Anthropic.Messages.MessageParam[] = [
728+
{
729+
role: "user",
730+
content: "Hello!",
731+
},
732+
]
733+
734+
const stream = o3AzureHandler.createMessage(systemPrompt, messages)
735+
await stream.next()
736+
737+
expect(mockCreate).toHaveBeenCalledWith(
738+
expect.objectContaining({
739+
model: "o3-mini",
740+
}),
741+
{ path: "/models/chat/completions" },
742+
)
743+
744+
// Verify max_tokens is NOT included when includeMaxTokens is false
745+
const callArgs = mockCreate.mock.calls[0][0]
746+
expect(callArgs).not.toHaveProperty("max_tokens")
747+
})
748+
749+
it("should include max_tokens for O3 model with Azure AI Inference Service when includeMaxTokens is true", async () => {
750+
const o3AzureHandler = new OpenAiHandler({
751+
...o3Options,
752+
openAiBaseUrl: "https://test.services.ai.azure.com",
753+
includeMaxTokens: true, // Should include max_tokens
754+
})
755+
const systemPrompt = "You are a helpful assistant."
756+
const messages: Anthropic.Messages.MessageParam[] = [
757+
{
758+
role: "user",
759+
content: "Hello!",
760+
},
761+
]
762+
763+
const stream = o3AzureHandler.createMessage(systemPrompt, messages)
764+
await stream.next()
765+
766+
expect(mockCreate).toHaveBeenCalledWith(
767+
expect.objectContaining({
768+
model: "o3-mini",
769+
max_tokens: 65536, // Included when includeMaxTokens is true
770+
}),
771+
{ path: "/models/chat/completions" },
772+
)
773+
})
774+
})
547775
})

src/api/providers/openai.ts

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -158,12 +158,8 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
158158
...(reasoning && reasoning),
159159
}
160160

161-
// @TODO: Move this to the `getModelParams` function.
162-
// Add max_tokens if specified or if using Azure AI Inference Service
163-
if (this.options.includeMaxTokens === true || isAzureAiInference) {
164-
// Use user-configured modelMaxTokens if available, otherwise fall back to model's default maxTokens
165-
requestOptions.max_tokens = this.options.modelMaxTokens || modelInfo.maxTokens
166-
}
161+
// Add max_tokens if needed
162+
this.addMaxTokensIfNeeded(requestOptions, modelInfo, isAzureAiInference)
167163

168164
const stream = await this.client.chat.completions.create(
169165
requestOptions,
@@ -224,10 +220,8 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
224220
: [systemMessage, ...convertToOpenAiMessages(messages)],
225221
}
226222

227-
// Add max_tokens if specified or if using Azure AI Inference Service
228-
if (this.options.includeMaxTokens === true || isAzureAiInference) {
229-
requestOptions.max_tokens = this.options.modelMaxTokens || modelInfo.maxTokens
230-
}
223+
// Add max_tokens if needed
224+
this.addMaxTokensIfNeeded(requestOptions, modelInfo, isAzureAiInference)
231225

232226
const response = await this.client.chat.completions.create(
233227
requestOptions,
@@ -263,17 +257,16 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
263257
async completePrompt(prompt: string): Promise<string> {
264258
try {
265259
const isAzureAiInference = this._isAzureAiInference(this.options.openAiBaseUrl)
266-
const modelInfo = this.getModel().info
260+
const model = this.getModel()
261+
const modelInfo = model.info
267262

268263
const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
269-
model: this.getModel().id,
264+
model: model.id,
270265
messages: [{ role: "user", content: prompt }],
271266
}
272267

273-
// Add max_tokens if specified or if using Azure AI Inference Service
274-
if (this.options.includeMaxTokens === true || isAzureAiInference) {
275-
requestOptions.max_tokens = this.options.modelMaxTokens || modelInfo.maxTokens
276-
}
268+
// Add max_tokens if needed
269+
this.addMaxTokensIfNeeded(requestOptions, modelInfo, isAzureAiInference)
277270

278271
const response = await this.client.chat.completions.create(
279272
requestOptions,
@@ -312,9 +305,13 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
312305
],
313306
stream: true,
314307
...(isGrokXAI ? {} : { stream_options: { include_usage: true } }),
315-
reasoning_effort: this.getModel().info.reasoningEffort,
308+
reasoning_effort: modelInfo.reasoningEffort,
309+
temperature: this.options.modelTemperature ?? 0,
316310
}
317311

312+
// Add max_tokens if needed
313+
this.addMaxTokensIfNeeded(requestOptions, modelInfo, methodIsAzureAiInference)
314+
318315
const stream = await this.client.chat.completions.create(
319316
requestOptions,
320317
methodIsAzureAiInference ? { path: OPENAI_AZURE_AI_INFERENCE_PATH } : {},
@@ -331,8 +328,13 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
331328
},
332329
...convertToOpenAiMessages(messages),
333330
],
331+
reasoning_effort: modelInfo.reasoningEffort,
332+
temperature: this.options.modelTemperature ?? 0,
334333
}
335334

335+
// Add max_tokens if needed
336+
this.addMaxTokensIfNeeded(requestOptions, modelInfo, methodIsAzureAiInference)
337+
336338
const response = await this.client.chat.completions.create(
337339
requestOptions,
338340
methodIsAzureAiInference ? { path: OPENAI_AZURE_AI_INFERENCE_PATH } : {},
@@ -383,6 +385,23 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl
383385
const urlHost = this._getUrlHost(baseUrl)
384386
return urlHost.endsWith(".services.ai.azure.com")
385387
}
388+
389+
/**
390+
* Adds max_tokens to the request body if needed based on provider configuration
391+
*/
392+
private addMaxTokensIfNeeded(
393+
requestOptions:
394+
| OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming
395+
| OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming,
396+
modelInfo: ModelInfo,
397+
isAzureAiInference: boolean,
398+
): void {
399+
// Only add max_tokens if includeMaxTokens is true
400+
if (this.options.includeMaxTokens === true) {
401+
// Use user-configured modelMaxTokens if available, otherwise fall back to model's default maxTokens
402+
requestOptions.max_tokens = this.options.modelMaxTokens || modelInfo.maxTokens
403+
}
404+
}
386405
}
387406

388407
export async function getOpenAiModels(baseUrl?: string, apiKey?: string, openAiHeaders?: Record<string, string>) {

webview-ui/src/i18n/locales/ca/settings.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -601,5 +601,7 @@
601601
"labels": {
602602
"customArn": "ARN personalitzat",
603603
"useCustomArn": "Utilitza ARN personalitzat..."
604-
}
604+
},
605+
"includeMaxOutputTokens": "Incloure tokens màxims de sortida",
606+
"includeMaxOutputTokensDescription": "Enviar el paràmetre de tokens màxims de sortida a les sol·licituds API. Alguns proveïdors poden no admetre això."
605607
}

webview-ui/src/i18n/locales/de/settings.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -601,5 +601,7 @@
601601
"labels": {
602602
"customArn": "Benutzerdefinierte ARN",
603603
"useCustomArn": "Benutzerdefinierte ARN verwenden..."
604-
}
604+
},
605+
"includeMaxOutputTokens": "Maximale Ausgabe-Tokens einbeziehen",
606+
"includeMaxOutputTokensDescription": "Sende den Parameter für maximale Ausgabe-Tokens in API-Anfragen. Einige Anbieter unterstützen dies möglicherweise nicht."
605607
}

0 commit comments

Comments (0)