Commit 602901f

roomote[bot] and daniel-lxs authored
fix: use max_completion_tokens for GPT-5 models in LiteLLM provider (#6980)
Co-authored-by: Roo Code <[email protected]>
Co-authored-by: daniel-lxs <[email protected]>
1 parent b5c58b6 commit 602901f
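
In short: GPT-5 models take max_completion_tokens rather than the deprecated max_tokens, so the LiteLLM provider now picks the token-limit field based on the model ID. A minimal sketch of the two request shapes, assuming a generic OpenAI-compatible chat completion body (the 8192 value is illustrative, not taken from the commit):

// Illustrative only; the field choice mirrors the provider change in this commit.
const gpt5Request = {
	model: "gpt-5",
	messages: [{ role: "user", content: "Hello" }],
	max_completion_tokens: 8192, // GPT-5 family: newer parameter
}

const legacyRequest = {
	model: "gpt-4-turbo",
	messages: [{ role: "user", content: "Hello" }],
	max_tokens: 8192, // all other models keep the existing parameter
}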

File tree

2 files changed: +270 -14 lines changed

src/api/providers/__tests__/lite-llm.spec.ts
src/api/providers/lite-llm.ts

src/api/providers/__tests__/lite-llm.spec.ts

Lines changed: 245 additions & 12 deletions
@@ -9,15 +9,9 @@ import { litellmDefaultModelId, litellmDefaultModelInfo } from "@roo-code/types"
 vi.mock("vscode", () => ({}))
 
 // Mock OpenAI
-vi.mock("openai", () => {
-	const mockStream = {
-		[Symbol.asyncIterator]: vi.fn(),
-	}
-
-	const mockCreate = vi.fn().mockReturnValue({
-		withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
-	})
+const mockCreate = vi.fn()
 
+vi.mock("openai", () => {
 	return {
 		default: vi.fn().mockImplementation(() => ({
 			chat: {
@@ -34,14 +28,25 @@ vi.mock("../fetchers/modelCache", () => ({
 	getModels: vi.fn().mockImplementation(() => {
 		return Promise.resolve({
 			[litellmDefaultModelId]: litellmDefaultModelInfo,
+			"gpt-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			gpt5: { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"GPT-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt5-preview": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5o": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5.1": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-5-mini": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-4": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"claude-3-opus": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"llama-3": { ...litellmDefaultModelInfo, maxTokens: 8192 },
+			"gpt-4-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
 		})
 	}),
 }))
 
 describe("LiteLLMHandler", () => {
 	let handler: LiteLLMHandler
 	let mockOptions: ApiHandlerOptions
-	let mockOpenAIClient: any
 
 	beforeEach(() => {
 		vi.clearAllMocks()
@@ -51,7 +56,6 @@ describe("LiteLLMHandler", () => {
 			litellmModelId: litellmDefaultModelId,
 		}
 		handler = new LiteLLMHandler(mockOptions)
-		mockOpenAIClient = new OpenAI()
 	})
 
 	describe("prompt caching", () => {
@@ -84,7 +88,7 @@ describe("LiteLLMHandler", () => {
 				},
 			}
 
-			mockOpenAIClient.chat.completions.create.mockReturnValue({
+			mockCreate.mockReturnValue({
 				withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
 			})
 
@@ -95,7 +99,7 @@ describe("LiteLLMHandler", () => {
 			}
 
 			// Verify that create was called with cache control headers
-			const createCall = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
+			const createCall = mockCreate.mock.calls[0][0]
 
 			// Check system message has cache control in the proper format
 			expect(createCall.messages[0]).toMatchObject({
@@ -154,4 +158,233 @@ describe("LiteLLMHandler", () => {
 			})
 		})
 	})
+
+	describe("GPT-5 model handling", () => {
+		it("should use max_completion_tokens instead of max_tokens for GPT-5 models", async () => {
+			const optionsWithGPT5: ApiHandlerOptions = {
+				...mockOptions,
+				litellmModelId: "gpt-5",
+			}
+			handler = new LiteLLMHandler(optionsWithGPT5)
+
+			const systemPrompt = "You are a helpful assistant"
+			const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]
+
+			// Mock the stream response
+			const mockStream = {
+				async *[Symbol.asyncIterator]() {
+					yield {
+						choices: [{ delta: { content: "Hello!" } }],
+						usage: {
+							prompt_tokens: 10,
+							completion_tokens: 5,
+						},
+					}
+				},
+			}
+
+			mockCreate.mockReturnValue({
+				withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
+			})
+
+			const generator = handler.createMessage(systemPrompt, messages)
+			const results = []
+			for await (const chunk of generator) {
+				results.push(chunk)
+			}
+
+			// Verify that create was called with max_completion_tokens instead of max_tokens
+			const createCall = mockCreate.mock.calls[0][0]
+
+			// Should have max_completion_tokens, not max_tokens
+			expect(createCall.max_completion_tokens).toBeDefined()
+			expect(createCall.max_tokens).toBeUndefined()
+		})
+
+		it("should use max_completion_tokens for various GPT-5 model variations", async () => {
+			const gpt5Variations = [
+				"gpt-5",
+				"gpt5",
+				"GPT-5",
+				"gpt-5-turbo",
+				"gpt5-preview",
+				"gpt-5o",
+				"gpt-5.1",
+				"gpt-5-mini",
+			]
+
+			for (const modelId of gpt5Variations) {
+				vi.clearAllMocks()
+
+				const optionsWithGPT5: ApiHandlerOptions = {
+					...mockOptions,
+					litellmModelId: modelId,
+				}
+				handler = new LiteLLMHandler(optionsWithGPT5)
+
+				const systemPrompt = "You are a helpful assistant"
+				const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test" }]
+
+				// Mock the stream response
+				const mockStream = {
+					async *[Symbol.asyncIterator]() {
+						yield {
+							choices: [{ delta: { content: "Response" } }],
+							usage: {
+								prompt_tokens: 10,
+								completion_tokens: 5,
+							},
+						}
+					},
+				}
+
+				mockCreate.mockReturnValue({
+					withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
+				})
+
+				const generator = handler.createMessage(systemPrompt, messages)
+				for await (const chunk of generator) {
+					// Consume the generator
+				}
+
+				// Verify that create was called with max_completion_tokens for this model variation
+				const createCall = mockCreate.mock.calls[0][0]
+
+				expect(createCall.max_completion_tokens).toBeDefined()
+				expect(createCall.max_tokens).toBeUndefined()
+			}
+		})
+
+		it("should still use max_tokens for non-GPT-5 models", async () => {
+			const nonGPT5Models = ["gpt-4", "claude-3-opus", "llama-3", "gpt-4-turbo"]
+
+			for (const modelId of nonGPT5Models) {
+				vi.clearAllMocks()
+
+				const options: ApiHandlerOptions = {
+					...mockOptions,
+					litellmModelId: modelId,
+				}
+				handler = new LiteLLMHandler(options)
+
+				const systemPrompt = "You are a helpful assistant"
+				const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test" }]
+
+				// Mock the stream response
+				const mockStream = {
+					async *[Symbol.asyncIterator]() {
+						yield {
+							choices: [{ delta: { content: "Response" } }],
+							usage: {
+								prompt_tokens: 10,
+								completion_tokens: 5,
+							},
+						}
+					},
+				}
+
+				mockCreate.mockReturnValue({
+					withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
+				})
+
+				const generator = handler.createMessage(systemPrompt, messages)
+				for await (const chunk of generator) {
+					// Consume the generator
+				}
+
+				// Verify that create was called with max_tokens for non-GPT-5 models
+				const createCall = mockCreate.mock.calls[0][0]
+
+				expect(createCall.max_tokens).toBeDefined()
+				expect(createCall.max_completion_tokens).toBeUndefined()
+			}
+		})
+
+		it("should use max_completion_tokens in completePrompt for GPT-5 models", async () => {
+			const optionsWithGPT5: ApiHandlerOptions = {
+				...mockOptions,
+				litellmModelId: "gpt-5",
+			}
+			handler = new LiteLLMHandler(optionsWithGPT5)
+
+			mockCreate.mockResolvedValue({
+				choices: [{ message: { content: "Test response" } }],
+			})
+
+			await handler.completePrompt("Test prompt")
+
+			// Verify that create was called with max_completion_tokens
+			const createCall = mockCreate.mock.calls[0][0]
+
+			expect(createCall.max_completion_tokens).toBeDefined()
+			expect(createCall.max_tokens).toBeUndefined()
+		})
+
+		it("should not set any max token fields when maxTokens is undefined (GPT-5 streaming)", async () => {
+			const optionsWithGPT5: ApiHandlerOptions = {
+				...mockOptions,
+				litellmModelId: "gpt-5",
+			}
+			handler = new LiteLLMHandler(optionsWithGPT5)
+
+			// Force fetchModel to return undefined maxTokens
+			vi.spyOn(handler as any, "fetchModel").mockResolvedValue({
+				id: "gpt-5",
+				info: { ...litellmDefaultModelInfo, maxTokens: undefined },
+			})
+
+			// Mock the stream response
+			const mockStream = {
+				async *[Symbol.asyncIterator]() {
+					yield {
+						choices: [{ delta: { content: "Hello!" } }],
+						usage: {
+							prompt_tokens: 10,
+							completion_tokens: 5,
+						},
+					}
+				},
+			}
+
+			mockCreate.mockReturnValue({
+				withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
+			})
+
+			const generator = handler.createMessage("You are a helpful assistant", [
+				{ role: "user", content: "Hello" } as unknown as Anthropic.Messages.MessageParam,
+			])
+			for await (const _chunk of generator) {
+				// consume
+			}
+
+			// Should not include either token field
+			const createCall = mockCreate.mock.calls[0][0]
+			expect(createCall.max_tokens).toBeUndefined()
+			expect(createCall.max_completion_tokens).toBeUndefined()
+		})
+
+		it("should not set any max token fields when maxTokens is undefined (GPT-5 completePrompt)", async () => {
+			const optionsWithGPT5: ApiHandlerOptions = {
+				...mockOptions,
+				litellmModelId: "gpt-5",
+			}
+			handler = new LiteLLMHandler(optionsWithGPT5)
+
+			// Force fetchModel to return undefined maxTokens
+			vi.spyOn(handler as any, "fetchModel").mockResolvedValue({
+				id: "gpt-5",
+				info: { ...litellmDefaultModelInfo, maxTokens: undefined },
+			})
+
+			mockCreate.mockResolvedValue({
+				choices: [{ message: { content: "Ok" } }],
+			})
+
+			await handler.completePrompt("Test prompt")
+
+			const createCall = mockCreate.mock.calls[0][0]
+			expect(createCall.max_tokens).toBeUndefined()
+			expect(createCall.max_completion_tokens).toBeUndefined()
+		})
+	})
 })

src/api/providers/lite-llm.ts

Lines changed: 25 additions & 2 deletions
@@ -32,6 +32,12 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 		})
 	}
 
+	private isGpt5(modelId: string): boolean {
+		// Match gpt-5, gpt5, and variants like gpt-5o, gpt-5-turbo, gpt5-preview, gpt-5.1
+		// Avoid matching gpt-50, gpt-500, etc.
+		return /\bgpt-?5(?!\d)/i.test(modelId)
+	}
+
 	override async *createMessage(
 		systemPrompt: string,
 		messages: Anthropic.Messages.MessageParam[],
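
A quick illustration, not part of the commit, of what that pattern does and does not match (same regular expression, evaluated by hand):

// Same regex as isGpt5 above.
const matchesGpt5 = (modelId: string) => /\bgpt-?5(?!\d)/i.test(modelId)

matchesGpt5("gpt-5")        // true
matchesGpt5("GPT-5")        // true  (case-insensitive flag)
matchesGpt5("gpt5-preview") // true  (hyphen is optional)
matchesGpt5("gpt-5.1")      // true  ("." is not a digit, so the lookahead passes)
matchesGpt5("gpt-50")       // false (negative lookahead rejects a following digit)
matchesGpt5("gpt-4-turbo")  // false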
@@ -107,16 +113,25 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 		// Required by some providers; others default to max tokens allowed
 		let maxTokens: number | undefined = info.maxTokens ?? undefined
 
+		// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
+		const isGPT5Model = this.isGpt5(modelId)
+
 		const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
 			model: modelId,
-			max_tokens: maxTokens,
 			messages: [systemMessage, ...enhancedMessages],
 			stream: true,
 			stream_options: {
 				include_usage: true,
 			},
 		}
 
+		// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
+		if (isGPT5Model && maxTokens) {
+			requestOptions.max_completion_tokens = maxTokens
+		} else if (maxTokens) {
+			requestOptions.max_tokens = maxTokens
+		}
+
 		if (this.supportsTemperature(modelId)) {
 			requestOptions.temperature = this.options.modelTemperature ?? 0
 		}
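
The same branch appears again in completePrompt below. A condensed, hypothetical restatement of the logic (not code from the commit) that also makes the undefined case explicit, which the new tests cover:

// Hypothetical helper mirroring the branch above.
function applyTokenLimit(
	opts: { max_tokens?: number; max_completion_tokens?: number },
	isGpt5Model: boolean,
	maxTokens: number | undefined,
): void {
	if (isGpt5Model && maxTokens) {
		opts.max_completion_tokens = maxTokens // GPT-5: newer parameter
	} else if (maxTokens) {
		opts.max_tokens = maxTokens // everything else: existing parameter
	}
	// maxTokens undefined (or 0): neither field is set
}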
@@ -179,6 +194,9 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 	async completePrompt(prompt: string): Promise<string> {
 		const { id: modelId, info } = await this.fetchModel()
 
+		// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
+		const isGPT5Model = this.isGpt5(modelId)
+
 		try {
 			const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
 				model: modelId,
@@ -189,7 +207,12 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
 				requestOptions.temperature = this.options.modelTemperature ?? 0
 			}
 
-			requestOptions.max_tokens = info.maxTokens
+			// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
+			if (isGPT5Model && info.maxTokens) {
+				requestOptions.max_completion_tokens = info.maxTokens
+			} else if (info.maxTokens) {
+				requestOptions.max_tokens = info.maxTokens
+			}
 
 			const response = await this.client.chat.completions.create(requestOptions)
 			return response.choices[0]?.message.content || ""
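
For reference, a minimal usage sketch; imports and the remaining ApiHandlerOptions fields (base URL, API key, and so on) are omitted because they are not shown in this commit:

// Hypothetical call site; only litellmModelId is shown.
async function demo(): Promise<void> {
	const handler = new LiteLLMHandler({ litellmModelId: "gpt-5-mini" } as ApiHandlerOptions)

	// With a GPT-5 ID the underlying chat.completions.create call now receives
	// max_completion_tokens; any other ID still receives max_tokens (when a limit is known).
	const reply = await handler.completePrompt("Say hello")
	console.log(reply)
}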

0 commit comments
