Skip to content

Commit f89b23c

Browse files
committed
refactor(litellm): centralize GPT-5 detection; expand variants; add undefined maxTokens guards and tests
1 parent 3d576b1 commit f89b23c

File tree

2 files changed (+88, −3 lines changed)

src/api/providers/__tests__/lite-llm.spec.ts

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ vi.mock("../fetchers/modelCache", () => ({
3434
"GPT-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
3535
"gpt-5-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
3636
"gpt5-preview": { ...litellmDefaultModelInfo, maxTokens: 8192 },
37+
"gpt-5o": { ...litellmDefaultModelInfo, maxTokens: 8192 },
38+
"gpt-5.1": { ...litellmDefaultModelInfo, maxTokens: 8192 },
39+
"gpt-5-mini": { ...litellmDefaultModelInfo, maxTokens: 8192 },
3740
"gpt-4": { ...litellmDefaultModelInfo, maxTokens: 8192 },
3841
"claude-3-opus": { ...litellmDefaultModelInfo, maxTokens: 8192 },
3942
"llama-3": { ...litellmDefaultModelInfo, maxTokens: 8192 },
@@ -200,7 +203,16 @@ describe("LiteLLMHandler", () => {
200203
})
201204

202205
it("should use max_completion_tokens for various GPT-5 model variations", async () => {
203-
const gpt5Variations = ["gpt-5", "gpt5", "GPT-5", "gpt-5-turbo", "gpt5-preview"]
206+
const gpt5Variations = [
207+
"gpt-5",
208+
"gpt5",
209+
"GPT-5",
210+
"gpt-5-turbo",
211+
"gpt5-preview",
212+
"gpt-5o",
213+
"gpt-5.1",
214+
"gpt-5-mini",
215+
]
204216

205217
for (const modelId of gpt5Variations) {
206218
vi.clearAllMocks()
@@ -308,5 +320,72 @@ describe("LiteLLMHandler", () => {
308320
expect(createCall.max_completion_tokens).toBeDefined()
309321
expect(createCall.max_tokens).toBeUndefined()
310322
})
323+
324+
it("should not set any max token fields when maxTokens is undefined (GPT-5 streaming)", async () => {
325+
const optionsWithGPT5: ApiHandlerOptions = {
326+
...mockOptions,
327+
litellmModelId: "gpt-5",
328+
}
329+
handler = new LiteLLMHandler(optionsWithGPT5)
330+
331+
// Force fetchModel to return undefined maxTokens
332+
vi.spyOn(handler as any, "fetchModel").mockResolvedValue({
333+
id: "gpt-5",
334+
info: { ...litellmDefaultModelInfo, maxTokens: undefined },
335+
})
336+
337+
// Mock the stream response
338+
const mockStream = {
339+
async *[Symbol.asyncIterator]() {
340+
yield {
341+
choices: [{ delta: { content: "Hello!" } }],
342+
usage: {
343+
prompt_tokens: 10,
344+
completion_tokens: 5,
345+
},
346+
}
347+
},
348+
}
349+
350+
mockCreate.mockReturnValue({
351+
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
352+
})
353+
354+
const generator = handler.createMessage("You are a helpful assistant", [
355+
{ role: "user", content: "Hello" } as unknown as Anthropic.Messages.MessageParam,
356+
])
357+
for await (const _chunk of generator) {
358+
// consume
359+
}
360+
361+
// Should not include either token field
362+
const createCall = mockCreate.mock.calls[0][0]
363+
expect(createCall.max_tokens).toBeUndefined()
364+
expect(createCall.max_completion_tokens).toBeUndefined()
365+
})
366+
367+
it("should not set any max token fields when maxTokens is undefined (GPT-5 completePrompt)", async () => {
368+
const optionsWithGPT5: ApiHandlerOptions = {
369+
...mockOptions,
370+
litellmModelId: "gpt-5",
371+
}
372+
handler = new LiteLLMHandler(optionsWithGPT5)
373+
374+
// Force fetchModel to return undefined maxTokens
375+
vi.spyOn(handler as any, "fetchModel").mockResolvedValue({
376+
id: "gpt-5",
377+
info: { ...litellmDefaultModelInfo, maxTokens: undefined },
378+
})
379+
380+
mockCreate.mockResolvedValue({
381+
choices: [{ message: { content: "Ok" } }],
382+
})
383+
384+
await handler.completePrompt("Test prompt")
385+
386+
const createCall = mockCreate.mock.calls[0][0]
387+
expect(createCall.max_tokens).toBeUndefined()
388+
expect(createCall.max_completion_tokens).toBeUndefined()
389+
})
311390
})
312391
})

src/api/providers/lite-llm.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
3232
})
3333
}
3434

35+
/**
 * Whether the given model id belongs to the GPT-5 family, which requires
 * `max_completion_tokens` instead of `max_tokens` on requests.
 *
 * Matches gpt-5 / gpt5 and suffixed variants (gpt-5o, gpt-5-turbo,
 * gpt5-preview, gpt-5.1) while rejecting gpt-50, gpt-500, etc.
 */
private isGpt5(modelId: string): boolean {
	const gpt5Family = /\bgpt-?5(?!\d)/i
	return gpt5Family.test(modelId)
}
40+
3541
override async *createMessage(
3642
systemPrompt: string,
3743
messages: Anthropic.Messages.MessageParam[],
@@ -108,7 +114,7 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
108114
let maxTokens: number | undefined = info.maxTokens ?? undefined
109115

110116
// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
111-
const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
117+
const isGPT5Model = this.isGpt5(modelId)
112118

113119
const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
114120
model: modelId,
@@ -190,7 +196,7 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
190196
const { id: modelId, info } = await this.fetchModel()
191197

192198
// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
193-
const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
199+
const isGPT5Model = this.isGpt5(modelId)
194200

195201
try {
196202
const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {

0 commit comments

Comments (0)