
Commit 1f205d4

committed
fix: respect includeMaxTokens option in BaseOpenAiCompatibleProvider
- Modified BaseOpenAiCompatibleProvider to only include the max_tokens parameter when the includeMaxTokens option is true
- This fixes issue #6936, where Kimi K2 model output was being truncated to 1024 tokens
- Updated tests for all providers that extend BaseOpenAiCompatibleProvider (groq, fireworks, chutes, sambanova, zai)
- Added new test cases to verify that max_tokens is not included by default and is included when includeMaxTokens is true

Fixes #6936
1 parent 76e5a72 commit 1f205d4
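The practical effect, sketched below with the Groq handler exercised in the updated tests (a minimal illustration, not code from this commit; the commented import path and the placeholder key are assumptions):

// import { GroqHandler } from "../groq" // illustrative path only

// Default after this fix: no max_tokens is sent with the request, so output is no
// longer cut off at the model's default cap (the 1024-token truncation from #6936).
const handler = new GroqHandler({
	apiModelId: "llama-3.1-8b-instant",
	groqApiKey: "my-groq-api-key",
})

// Opting in: max_tokens is added, preferring the user-configured modelMaxTokens and
// falling back to the model's default maxTokens.
const cappedHandler = new GroqHandler({
	apiModelId: "llama-3.1-8b-instant",
	groqApiKey: "my-groq-api-key",
	includeMaxTokens: true,
	modelMaxTokens: 2048,
})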

File tree

6 files changed: +258 -12 lines changed

src/api/providers/__tests__/chutes.spec.ts
src/api/providers/__tests__/fireworks.spec.ts
src/api/providers/__tests__/groq.spec.ts
src/api/providers/__tests__/sambanova.spec.ts
src/api/providers/__tests__/zai.spec.ts
src/api/providers/base-openai-compatible-provider.ts


src/api/providers/__tests__/chutes.spec.ts
Lines changed: 43 additions & 2 deletions

@@ -325,9 +325,8 @@ describe("ChutesHandler", () => {
 		)
 	})
 
-	it("createMessage should pass correct parameters to Chutes client for non-DeepSeek models", async () => {
+	it("createMessage should not include max_tokens by default for non-DeepSeek models", async () => {
 		const modelId: ChutesModelId = "unsloth/Llama-3.3-70B-Instruct"
-		const modelInfo = chutesModels[modelId]
 		const handlerWithModel = new ChutesHandler({ apiModelId: modelId, chutesApiKey: "test-chutes-api-key" })
 
 		mockCreate.mockImplementationOnce(() => {
@@ -346,6 +345,48 @@ describe("ChutesHandler", () => {
 		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
 		await messageGenerator.next()
 
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.objectContaining({
+				model: modelId,
+				temperature: 0.5,
+				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
+				stream: true,
+				stream_options: { include_usage: true },
+			}),
+		)
+		// Verify max_tokens is NOT included
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.not.objectContaining({
+				max_tokens: expect.anything(),
+			}),
+		)
+	})
+
+	it("createMessage should include max_tokens when includeMaxTokens is true for non-DeepSeek models", async () => {
+		const modelId: ChutesModelId = "unsloth/Llama-3.3-70B-Instruct"
+		const modelInfo = chutesModels[modelId]
+		const handlerWithModel = new ChutesHandler({
+			apiModelId: modelId,
+			chutesApiKey: "test-chutes-api-key",
+			includeMaxTokens: true,
+		})
+
+		mockCreate.mockImplementationOnce(() => {
+			return {
+				[Symbol.asyncIterator]: () => ({
+					async next() {
+						return { done: true }
+					},
+				}),
+			}
+		})
+
+		const systemPrompt = "Test system prompt for Chutes"
+		const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test message for Chutes" }]
+
+		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
+		await messageGenerator.next()
+
 		expect(mockCreate).toHaveBeenCalledWith(
 			expect.objectContaining({
 				model: modelId,

src/api/providers/__tests__/fireworks.spec.ts
Lines changed: 42 additions & 1 deletion

@@ -324,12 +324,53 @@ describe("FireworksHandler", () => {
 		expect(firstChunk.value).toEqual({ type: "usage", inputTokens: 10, outputTokens: 20 })
 	})
 
-	it("createMessage should pass correct parameters to Fireworks client", async () => {
+	it("createMessage should not include max_tokens by default", async () => {
+		const modelId: FireworksModelId = "accounts/fireworks/models/kimi-k2-instruct"
+		const handlerWithModel = new FireworksHandler({
+			apiModelId: modelId,
+			fireworksApiKey: "test-fireworks-api-key",
+		})
+
+		mockCreate.mockImplementationOnce(() => {
+			return {
+				[Symbol.asyncIterator]: () => ({
+					async next() {
+						return { done: true }
+					},
+				}),
+			}
+		})
+
+		const systemPrompt = "Test system prompt for Fireworks"
+		const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test message for Fireworks" }]
+
+		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
+		await messageGenerator.next()
+
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.objectContaining({
+				model: modelId,
+				temperature: 0.5,
+				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
+				stream: true,
+				stream_options: { include_usage: true },
+			}),
+		)
+		// Verify max_tokens is NOT included
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.not.objectContaining({
+				max_tokens: expect.anything(),
+			}),
+		)
+	})
+
+	it("createMessage should include max_tokens when includeMaxTokens is true", async () => {
 		const modelId: FireworksModelId = "accounts/fireworks/models/kimi-k2-instruct"
 		const modelInfo = fireworksModels[modelId]
 		const handlerWithModel = new FireworksHandler({
 			apiModelId: modelId,
 			fireworksApiKey: "test-fireworks-api-key",
+			includeMaxTokens: true,
 		})
 
 		mockCreate.mockImplementationOnce(() => {

src/api/providers/__tests__/groq.spec.ts
Lines changed: 81 additions & 2 deletions

@@ -111,9 +111,8 @@ describe("GroqHandler", () => {
 		expect(firstChunk.value).toEqual({ type: "usage", inputTokens: 10, outputTokens: 20 })
 	})
 
-	it("createMessage should pass correct parameters to Groq client", async () => {
+	it("createMessage should not include max_tokens by default", async () => {
 		const modelId: GroqModelId = "llama-3.1-8b-instant"
-		const modelInfo = groqModels[modelId]
 		const handlerWithModel = new GroqHandler({ apiModelId: modelId, groqApiKey: "test-groq-api-key" })
 
 		mockCreate.mockImplementationOnce(() => {
@@ -132,6 +131,48 @@ describe("GroqHandler", () => {
 		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
 		await messageGenerator.next()
 
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.objectContaining({
+				model: modelId,
+				temperature: 0.5,
+				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
+				stream: true,
+				stream_options: { include_usage: true },
+			}),
+		)
+		// Verify max_tokens is NOT included
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.not.objectContaining({
+				max_tokens: expect.anything(),
+			}),
+		)
+	})
+
+	it("createMessage should include max_tokens when includeMaxTokens is true", async () => {
+		const modelId: GroqModelId = "llama-3.1-8b-instant"
+		const modelInfo = groqModels[modelId]
+		const handlerWithModel = new GroqHandler({
+			apiModelId: modelId,
+			groqApiKey: "test-groq-api-key",
+			includeMaxTokens: true,
+		})
+
+		mockCreate.mockImplementationOnce(() => {
+			return {
+				[Symbol.asyncIterator]: () => ({
+					async next() {
+						return { done: true }
+					},
+				}),
+			}
+		})
+
+		const systemPrompt = "Test system prompt for Groq"
+		const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test message for Groq" }]
+
+		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
+		await messageGenerator.next()
+
 		expect(mockCreate).toHaveBeenCalledWith(
 			expect.objectContaining({
 				model: modelId,
@@ -143,4 +184,42 @@ describe("GroqHandler", () => {
 			}),
 		)
 	})
+
+	it("createMessage should use modelMaxTokens over default when includeMaxTokens is true", async () => {
+		const modelId: GroqModelId = "llama-3.1-8b-instant"
+		const customMaxTokens = 2048
+		const handlerWithModel = new GroqHandler({
+			apiModelId: modelId,
+			groqApiKey: "test-groq-api-key",
+			includeMaxTokens: true,
+			modelMaxTokens: customMaxTokens,
+		})
+
+		mockCreate.mockImplementationOnce(() => {
+			return {
+				[Symbol.asyncIterator]: () => ({
+					async next() {
+						return { done: true }
+					},
+				}),
+			}
+		})
+
+		const systemPrompt = "Test system prompt for Groq"
+		const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test message for Groq" }]
+
+		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
+		await messageGenerator.next()
+
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.objectContaining({
+				model: modelId,
+				max_tokens: customMaxTokens,
+				temperature: 0.5,
+				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
+				stream: true,
+				stream_options: { include_usage: true },
+			}),
+		)
+	})
 })

src/api/providers/__tests__/sambanova.spec.ts
Lines changed: 42 additions & 1 deletion

@@ -116,12 +116,53 @@ describe("SambaNovaHandler", () => {
 		expect(firstChunk.value).toEqual({ type: "usage", inputTokens: 10, outputTokens: 20 })
 	})
 
-	it("createMessage should pass correct parameters to SambaNova client", async () => {
+	it("createMessage should not include max_tokens by default", async () => {
+		const modelId: SambaNovaModelId = "Meta-Llama-3.3-70B-Instruct"
+		const handlerWithModel = new SambaNovaHandler({
+			apiModelId: modelId,
+			sambaNovaApiKey: "test-sambanova-api-key",
+		})
+
+		mockCreate.mockImplementationOnce(() => {
+			return {
+				[Symbol.asyncIterator]: () => ({
+					async next() {
+						return { done: true }
+					},
+				}),
+			}
+		})
+
+		const systemPrompt = "Test system prompt for SambaNova"
+		const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test message for SambaNova" }]
+
+		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
+		await messageGenerator.next()
+
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.objectContaining({
+				model: modelId,
+				temperature: 0.7,
+				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
+				stream: true,
+				stream_options: { include_usage: true },
+			}),
+		)
+		// Verify max_tokens is NOT included
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.not.objectContaining({
+				max_tokens: expect.anything(),
+			}),
+		)
+	})
+
+	it("createMessage should include max_tokens when includeMaxTokens is true", async () => {
 		const modelId: SambaNovaModelId = "Meta-Llama-3.3-70B-Instruct"
 		const modelInfo = sambaNovaModels[modelId]
 		const handlerWithModel = new SambaNovaHandler({
 			apiModelId: modelId,
 			sambaNovaApiKey: "test-sambanova-api-key",
+			includeMaxTokens: true,
 		})
 
 		mockCreate.mockImplementationOnce(() => {

src/api/providers/__tests__/zai.spec.ts
Lines changed: 43 additions & 1 deletion

@@ -191,13 +191,55 @@ describe("ZAiHandler", () => {
 		expect(firstChunk.value).toEqual({ type: "usage", inputTokens: 10, outputTokens: 20 })
 	})
 
-	it("createMessage should pass correct parameters to Z AI client", async () => {
+	it("createMessage should not include max_tokens by default", async () => {
+		const modelId: InternationalZAiModelId = "glm-4.5"
+		const handlerWithModel = new ZAiHandler({
+			apiModelId: modelId,
+			zaiApiKey: "test-zai-api-key",
+			zaiApiLine: "international",
+		})
+
+		mockCreate.mockImplementationOnce(() => {
+			return {
+				[Symbol.asyncIterator]: () => ({
+					async next() {
+						return { done: true }
+					},
+				}),
+			}
+		})
+
+		const systemPrompt = "Test system prompt for Z AI"
+		const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test message for Z AI" }]
+
+		const messageGenerator = handlerWithModel.createMessage(systemPrompt, messages)
+		await messageGenerator.next()
+
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.objectContaining({
+				model: modelId,
+				temperature: ZAI_DEFAULT_TEMPERATURE,
+				messages: expect.arrayContaining([{ role: "system", content: systemPrompt }]),
+				stream: true,
+				stream_options: { include_usage: true },
+			}),
+		)
+		// Verify max_tokens is NOT included
+		expect(mockCreate).toHaveBeenCalledWith(
+			expect.not.objectContaining({
+				max_tokens: expect.anything(),
+			}),
+		)
+	})
+
+	it("createMessage should include max_tokens when includeMaxTokens is true", async () => {
 		const modelId: InternationalZAiModelId = "glm-4.5"
 		const modelInfo = internationalZAiModels[modelId]
 		const handlerWithModel = new ZAiHandler({
 			apiModelId: modelId,
 			zaiApiKey: "test-zai-api-key",
 			zaiApiLine: "international",
+			includeMaxTokens: true,
 		})
 
 		mockCreate.mockImplementationOnce(() => {

src/api/providers/base-openai-compatible-provider.ts
Lines changed: 7 additions & 5 deletions

@@ -67,22 +67,24 @@ export abstract class BaseOpenAiCompatibleProvider<ModelName extends string>
 		messages: Anthropic.Messages.MessageParam[],
 		metadata?: ApiHandlerCreateMessageMetadata,
 	): ApiStream {
-		const {
-			id: model,
-			info: { maxTokens: max_tokens },
-		} = this.getModel()
+		const { id: model, info } = this.getModel()
 
 		const temperature = this.options.modelTemperature ?? this.defaultTemperature
 
 		const params: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
 			model,
-			max_tokens,
 			temperature,
 			messages: [{ role: "system", content: systemPrompt }, ...convertToOpenAiMessages(messages)],
 			stream: true,
 			stream_options: { include_usage: true },
 		}
 
+		// Only add max_tokens if includeMaxTokens is true
+		if (this.options.includeMaxTokens === true) {
+			// Use user-configured modelMaxTokens if available, otherwise fall back to model's default maxTokens
+			params.max_tokens = this.options.modelMaxTokens || info.maxTokens
+		}
+
		const stream = await this.client.chat.completions.create(params)
 
 		for await (const chunk of stream) {
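In isolation, the new gating behaves like the following sketch (same option and field names as the diff above; a standalone helper written for illustration only, not part of this commit):

// Returns the max_tokens value to send, or undefined when the field should be omitted.
function resolveMaxTokens(
	options: { includeMaxTokens?: boolean; modelMaxTokens?: number },
	info: { maxTokens?: number },
): number | undefined {
	if (options.includeMaxTokens !== true) {
		return undefined // max_tokens stays out of the request entirely
	}
	// Prefer the user-configured modelMaxTokens, otherwise the model's default maxTokens
	return options.modelMaxTokens || info.maxTokens
}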
