Skip to content

Commit c04e019

Browse files
committed
fix: use max_completion_tokens for GPT-5 models in LiteLLM provider
- GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
- Added detection for GPT-5 model variants (gpt-5, gpt5, GPT-5, etc.)
- Updated both createMessage and completePrompt methods to handle GPT-5 models
- Added comprehensive tests for GPT-5 model handling

Fixes #6979
1 parent 5f3c67f commit c04e019

File tree

2 files changed

+187
-14
lines changed

2 files changed

+187
-14
lines changed

src/api/providers/__tests__/lite-llm.spec.ts

Lines changed: 166 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,9 @@ import { litellmDefaultModelId, litellmDefaultModelInfo } from "@roo-code/types"
1010
vi.mock("vscode", () => ({}))
1111

1212
// Mock OpenAI
13-
vi.mock("openai", () => {
14-
const mockStream = {
15-
[Symbol.asyncIterator]: vi.fn(),
16-
}
17-
18-
const mockCreate = vi.fn().mockReturnValue({
19-
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
20-
})
13+
const mockCreate = vi.fn()
2114

15+
vi.mock("openai", () => {
2216
return {
2317
default: vi.fn().mockImplementation(() => ({
2418
chat: {
@@ -35,14 +29,22 @@ vi.mock("../fetchers/modelCache", () => ({
3529
getModels: vi.fn().mockImplementation(() => {
3630
return Promise.resolve({
3731
[litellmDefaultModelId]: litellmDefaultModelInfo,
32+
"gpt-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
33+
gpt5: { ...litellmDefaultModelInfo, maxTokens: 8192 },
34+
"GPT-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
35+
"gpt-5-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
36+
"gpt5-preview": { ...litellmDefaultModelInfo, maxTokens: 8192 },
37+
"gpt-4": { ...litellmDefaultModelInfo, maxTokens: 8192 },
38+
"claude-3-opus": { ...litellmDefaultModelInfo, maxTokens: 8192 },
39+
"llama-3": { ...litellmDefaultModelInfo, maxTokens: 8192 },
40+
"gpt-4-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
3841
})
3942
}),
4043
}))
4144

4245
describe("LiteLLMHandler", () => {
4346
let handler: LiteLLMHandler
4447
let mockOptions: ApiHandlerOptions
45-
let mockOpenAIClient: any
4648

4749
beforeEach(() => {
4850
vi.clearAllMocks()
@@ -52,7 +54,6 @@ describe("LiteLLMHandler", () => {
5254
litellmModelId: litellmDefaultModelId,
5355
}
5456
handler = new LiteLLMHandler(mockOptions)
55-
mockOpenAIClient = new OpenAI()
5657
})
5758

5859
describe("prompt caching", () => {
@@ -85,7 +86,7 @@ describe("LiteLLMHandler", () => {
8586
},
8687
}
8788

88-
mockOpenAIClient.chat.completions.create.mockReturnValue({
89+
mockCreate.mockReturnValue({
8990
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
9091
})
9192

@@ -96,7 +97,7 @@ describe("LiteLLMHandler", () => {
9697
}
9798

9899
// Verify that create was called with cache control headers
99-
const createCall = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
100+
const createCall = mockCreate.mock.calls[0][0]
100101

101102
// Check system message has cache control in the proper format
102103
expect(createCall.messages[0]).toMatchObject({
@@ -155,4 +156,157 @@ describe("LiteLLMHandler", () => {
155156
})
156157
})
157158
})
159+
160+
describe("GPT-5 model handling", () => {
161+
it("should use max_completion_tokens instead of max_tokens for GPT-5 models", async () => {
162+
const optionsWithGPT5: ApiHandlerOptions = {
163+
...mockOptions,
164+
litellmModelId: "gpt-5",
165+
}
166+
handler = new LiteLLMHandler(optionsWithGPT5)
167+
168+
const systemPrompt = "You are a helpful assistant"
169+
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]
170+
171+
// Mock the stream response
172+
const mockStream = {
173+
async *[Symbol.asyncIterator]() {
174+
yield {
175+
choices: [{ delta: { content: "Hello!" } }],
176+
usage: {
177+
prompt_tokens: 10,
178+
completion_tokens: 5,
179+
},
180+
}
181+
},
182+
}
183+
184+
mockCreate.mockReturnValue({
185+
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
186+
})
187+
188+
const generator = handler.createMessage(systemPrompt, messages)
189+
const results = []
190+
for await (const chunk of generator) {
191+
results.push(chunk)
192+
}
193+
194+
// Verify that create was called with max_completion_tokens instead of max_tokens
195+
const createCall = mockCreate.mock.calls[0][0]
196+
197+
// Should have max_completion_tokens, not max_tokens
198+
expect(createCall.max_completion_tokens).toBeDefined()
199+
expect(createCall.max_tokens).toBeUndefined()
200+
})
201+
202+
it("should use max_completion_tokens for various GPT-5 model variations", async () => {
203+
const gpt5Variations = ["gpt-5", "gpt5", "GPT-5", "gpt-5-turbo", "gpt5-preview"]
204+
205+
for (const modelId of gpt5Variations) {
206+
vi.clearAllMocks()
207+
208+
const optionsWithGPT5: ApiHandlerOptions = {
209+
...mockOptions,
210+
litellmModelId: modelId,
211+
}
212+
handler = new LiteLLMHandler(optionsWithGPT5)
213+
214+
const systemPrompt = "You are a helpful assistant"
215+
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test" }]
216+
217+
// Mock the stream response
218+
const mockStream = {
219+
async *[Symbol.asyncIterator]() {
220+
yield {
221+
choices: [{ delta: { content: "Response" } }],
222+
usage: {
223+
prompt_tokens: 10,
224+
completion_tokens: 5,
225+
},
226+
}
227+
},
228+
}
229+
230+
mockCreate.mockReturnValue({
231+
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
232+
})
233+
234+
const generator = handler.createMessage(systemPrompt, messages)
235+
for await (const chunk of generator) {
236+
// Consume the generator
237+
}
238+
239+
// Verify that create was called with max_completion_tokens for this model variation
240+
const createCall = mockCreate.mock.calls[0][0]
241+
242+
expect(createCall.max_completion_tokens).toBeDefined()
243+
expect(createCall.max_tokens).toBeUndefined()
244+
}
245+
})
246+
247+
it("should still use max_tokens for non-GPT-5 models", async () => {
248+
const nonGPT5Models = ["gpt-4", "claude-3-opus", "llama-3", "gpt-4-turbo"]
249+
250+
for (const modelId of nonGPT5Models) {
251+
vi.clearAllMocks()
252+
253+
const options: ApiHandlerOptions = {
254+
...mockOptions,
255+
litellmModelId: modelId,
256+
}
257+
handler = new LiteLLMHandler(options)
258+
259+
const systemPrompt = "You are a helpful assistant"
260+
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test" }]
261+
262+
// Mock the stream response
263+
const mockStream = {
264+
async *[Symbol.asyncIterator]() {
265+
yield {
266+
choices: [{ delta: { content: "Response" } }],
267+
usage: {
268+
prompt_tokens: 10,
269+
completion_tokens: 5,
270+
},
271+
}
272+
},
273+
}
274+
275+
mockCreate.mockReturnValue({
276+
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
277+
})
278+
279+
const generator = handler.createMessage(systemPrompt, messages)
280+
for await (const chunk of generator) {
281+
// Consume the generator
282+
}
283+
284+
// Verify that create was called with max_tokens for non-GPT-5 models
285+
const createCall = mockCreate.mock.calls[0][0]
286+
287+
expect(createCall.max_tokens).toBeDefined()
288+
expect(createCall.max_completion_tokens).toBeUndefined()
289+
}
290+
})
291+
292+
it("should use max_completion_tokens in completePrompt for GPT-5 models", async () => {
293+
const optionsWithGPT5: ApiHandlerOptions = {
294+
...mockOptions,
295+
litellmModelId: "gpt-5",
296+
}
297+
handler = new LiteLLMHandler(optionsWithGPT5)
298+
299+
mockCreate.mockResolvedValue({
300+
choices: [{ message: { content: "Test response" } }],
301+
})
302+
303+
await handler.completePrompt("Test prompt")
304+
305+
// Verify that create was called with max_completion_tokens
306+
const createCall = mockCreate.mock.calls[0][0]
307+
308+
expect(createCall.max_completion_tokens).toBeDefined()
309+
expect(createCall.max_tokens).toBeUndefined()
310+
})
311+
})
158312
})

src/api/providers/lite-llm.ts

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,16 +107,26 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
107107
// Required by some providers; others default to max tokens allowed
108108
let maxTokens: number | undefined = info.maxTokens ?? undefined
109109

110+
// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
111+
const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
112+
110113
const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
111114
model: modelId,
112-
max_tokens: maxTokens,
113115
messages: [systemMessage, ...enhancedMessages],
114116
stream: true,
115117
stream_options: {
116118
include_usage: true,
117119
},
118120
}
119121

122+
// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
123+
if (isGPT5Model && maxTokens) {
124+
// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
125+
requestOptions.max_completion_tokens = maxTokens
126+
} else if (maxTokens) {
127+
requestOptions.max_tokens = maxTokens
128+
}
129+
120130
if (this.supportsTemperature(modelId)) {
121131
requestOptions.temperature = this.options.modelTemperature ?? 0
122132
}
@@ -179,6 +189,9 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
179189
async completePrompt(prompt: string): Promise<string> {
180190
const { id: modelId, info } = await this.fetchModel()
181191

192+
// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
193+
const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
194+
182195
try {
183196
const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
184197
model: modelId,
@@ -189,7 +202,13 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
189202
requestOptions.temperature = this.options.modelTemperature ?? 0
190203
}
191204

192-
requestOptions.max_tokens = info.maxTokens
205+
// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
206+
if (isGPT5Model && info.maxTokens) {
207+
// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
208+
requestOptions.max_completion_tokens = info.maxTokens
209+
} else if (info.maxTokens) {
210+
requestOptions.max_tokens = info.maxTokens
211+
}
193212

194213
const response = await this.client.chat.completions.create(requestOptions)
195214
return response.choices[0]?.message.content || ""

0 commit comments

Comments (0)