fix: use max_completion_tokens for GPT-5 models in LiteLLM provider #6980

Draft
wants to merge 1 commit into base: main
178 changes: 166 additions & 12 deletions src/api/providers/__tests__/lite-llm.spec.ts
@@ -10,15 +10,9 @@ import { litellmDefaultModelId, litellmDefaultModelInfo } from "@roo-code/types"
vi.mock("vscode", () => ({}))

// Mock OpenAI
vi.mock("openai", () => {
const mockStream = {
[Symbol.asyncIterator]: vi.fn(),
}

const mockCreate = vi.fn().mockReturnValue({
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
})
const mockCreate = vi.fn()

vi.mock("openai", () => {
return {
default: vi.fn().mockImplementation(() => ({
chat: {
@@ -35,14 +29,22 @@ vi.mock("../fetchers/modelCache", () => ({
getModels: vi.fn().mockImplementation(() => {
return Promise.resolve({
[litellmDefaultModelId]: litellmDefaultModelInfo,
"gpt-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
gpt5: { ...litellmDefaultModelInfo, maxTokens: 8192 },
"GPT-5": { ...litellmDefaultModelInfo, maxTokens: 8192 },
"gpt-5-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
"gpt5-preview": { ...litellmDefaultModelInfo, maxTokens: 8192 },
"gpt-4": { ...litellmDefaultModelInfo, maxTokens: 8192 },
"claude-3-opus": { ...litellmDefaultModelInfo, maxTokens: 8192 },
"llama-3": { ...litellmDefaultModelInfo, maxTokens: 8192 },
"gpt-4-turbo": { ...litellmDefaultModelInfo, maxTokens: 8192 },
})
}),
}))

describe("LiteLLMHandler", () => {
let handler: LiteLLMHandler
let mockOptions: ApiHandlerOptions
let mockOpenAIClient: any

beforeEach(() => {
vi.clearAllMocks()
@@ -52,7 +54,6 @@ describe("LiteLLMHandler", () => {
litellmModelId: litellmDefaultModelId,
}
handler = new LiteLLMHandler(mockOptions)
mockOpenAIClient = new OpenAI()
})

describe("prompt caching", () => {
@@ -85,7 +86,7 @@ describe("LiteLLMHandler", () => {
},
}

mockOpenAIClient.chat.completions.create.mockReturnValue({
mockCreate.mockReturnValue({
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
})

@@ -96,7 +97,7 @@ }
}

// Verify that create was called with cache control headers
const createCall = mockOpenAIClient.chat.completions.create.mock.calls[0][0]
const createCall = mockCreate.mock.calls[0][0]

// Check system message has cache control in the proper format
expect(createCall.messages[0]).toMatchObject({
@@ -155,4 +156,157 @@ describe("LiteLLMHandler", () => {
})
})
})

describe("GPT-5 model handling", () => {
it("should use max_completion_tokens instead of max_tokens for GPT-5 models", async () => {
const optionsWithGPT5: ApiHandlerOptions = {
...mockOptions,
litellmModelId: "gpt-5",
}
handler = new LiteLLMHandler(optionsWithGPT5)

const systemPrompt = "You are a helpful assistant"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello" }]

// Mock the stream response
const mockStream = {
async *[Symbol.asyncIterator]() {
yield {
choices: [{ delta: { content: "Hello!" } }],
usage: {
prompt_tokens: 10,
completion_tokens: 5,
},
}
},
}

mockCreate.mockReturnValue({
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
})

const generator = handler.createMessage(systemPrompt, messages)
const results = []
for await (const chunk of generator) {
results.push(chunk)
}

// Verify that create was called with max_completion_tokens instead of max_tokens
const createCall = mockCreate.mock.calls[0][0]

// Should have max_completion_tokens, not max_tokens
expect(createCall.max_completion_tokens).toBeDefined()
expect(createCall.max_tokens).toBeUndefined()
})

it("should use max_completion_tokens for various GPT-5 model variations", async () => {
const gpt5Variations = ["gpt-5", "gpt5", "GPT-5", "gpt-5-turbo", "gpt5-preview"]
Review comment (Author): Great test coverage! Consider adding edge cases like mixed case variations ("GpT-5", "gPt5") or models with additional suffixes ("gpt-5-32k", "gpt-5-vision") to ensure the detection works correctly for all possible GPT-5 model names.
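A minimal sketch of how those extra cases could be folded into the list above (the added IDs are illustrative, not part of this PR, and would likely also need matching entries in the mocked getModels() response for the handler to resolve them):

const gpt5Variations = [
  "gpt-5",
  "gpt5",
  "GPT-5",
  "gpt-5-turbo",
  "gpt5-preview",
  // suggested edge cases, not in this PR:
  "GpT-5",
  "gPt5",
  "gpt-5-32k",
  "gpt-5-vision",
]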


for (const modelId of gpt5Variations) {
vi.clearAllMocks()

const optionsWithGPT5: ApiHandlerOptions = {
...mockOptions,
litellmModelId: modelId,
}
handler = new LiteLLMHandler(optionsWithGPT5)

const systemPrompt = "You are a helpful assistant"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test" }]

// Mock the stream response
const mockStream = {
async *[Symbol.asyncIterator]() {
yield {
choices: [{ delta: { content: "Response" } }],
usage: {
prompt_tokens: 10,
completion_tokens: 5,
},
}
},
}

mockCreate.mockReturnValue({
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
})

const generator = handler.createMessage(systemPrompt, messages)
for await (const chunk of generator) {
// Consume the generator
}

// Verify that create was called with max_completion_tokens for this model variation
const createCall = mockCreate.mock.calls[0][0]

expect(createCall.max_completion_tokens).toBeDefined()
expect(createCall.max_tokens).toBeUndefined()
}
})

it("should still use max_tokens for non-GPT-5 models", async () => {
const nonGPT5Models = ["gpt-4", "claude-3-opus", "llama-3", "gpt-4-turbo"]

for (const modelId of nonGPT5Models) {
vi.clearAllMocks()

const options: ApiHandlerOptions = {
...mockOptions,
litellmModelId: modelId,
}
handler = new LiteLLMHandler(options)

const systemPrompt = "You are a helpful assistant"
const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Test" }]

// Mock the stream response
const mockStream = {
async *[Symbol.asyncIterator]() {
yield {
choices: [{ delta: { content: "Response" } }],
usage: {
prompt_tokens: 10,
completion_tokens: 5,
},
}
},
}

mockCreate.mockReturnValue({
withResponse: vi.fn().mockResolvedValue({ data: mockStream }),
})

const generator = handler.createMessage(systemPrompt, messages)
for await (const chunk of generator) {
// Consume the generator
}

// Verify that create was called with max_tokens for non-GPT-5 models
const createCall = mockCreate.mock.calls[0][0]

expect(createCall.max_tokens).toBeDefined()
expect(createCall.max_completion_tokens).toBeUndefined()
}
})

it("should use max_completion_tokens in completePrompt for GPT-5 models", async () => {
const optionsWithGPT5: ApiHandlerOptions = {
...mockOptions,
litellmModelId: "gpt-5",
}
handler = new LiteLLMHandler(optionsWithGPT5)

mockCreate.mockResolvedValue({
choices: [{ message: { content: "Test response" } }],
})

await handler.completePrompt("Test prompt")

// Verify that create was called with max_completion_tokens
const createCall = mockCreate.mock.calls[0][0]

expect(createCall.max_completion_tokens).toBeDefined()
expect(createCall.max_tokens).toBeUndefined()
})
})
})
23 changes: 21 additions & 2 deletions src/api/providers/lite-llm.ts
@@ -107,16 +107,26 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
// Required by some providers; others default to max tokens allowed
let maxTokens: number | undefined = info.maxTokens ?? undefined

// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
Review comment: Consider extracting the GPT-5 model detection logic into a shared helper function. This logic (using modelId.toLowerCase().includes('gpt-5') || modelId.toLowerCase().includes('gpt5')) appears in both createMessage and completePrompt, and centralizing it would improve maintainability. (This comment was generated because it violated a code review rule: irule_tTqpIuNs8DV0QFGj.)

Review comment (Author): The model detection logic could be more precise. Currently, modelId.toLowerCase().includes("gpt-5") would match unintended models like "not-gpt-5000". Consider using a more specific pattern:

Suggested change, replacing:
const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
with:
// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
const modelLower = modelId.toLowerCase()
const isGPT5Model = modelLower.startsWith("gpt-5") || modelLower.startsWith("gpt5") || modelLower === "gpt5"
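For illustration, a quick comparison of how the two patterns behave on a few hypothetical model IDs (none of these IDs appear in the PR):

// Substring matching flags anything containing "gpt-5"; prefix matching does not.
const ids = ["gpt-5-turbo", "not-gpt-5000", "chatgpt-5"]
for (const id of ids) {
  const lower = id.toLowerCase()
  const byIncludes = lower.includes("gpt-5") || lower.includes("gpt5")
  const byPrefix = lower.startsWith("gpt-5") || lower.startsWith("gpt5")
  console.log(`${id}: includes=${byIncludes} startsWith=${byPrefix}`)
}
// gpt-5-turbo: includes=true startsWith=true
// not-gpt-5000: includes=true startsWith=false
// chatgpt-5: includes=true startsWith=false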


const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming = {
model: modelId,
max_tokens: maxTokens,
messages: [systemMessage, ...enhancedMessages],
stream: true,
stream_options: {
include_usage: true,
},
}

// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
Review comment (Author): Consider adding a comment explaining why GPT-5 models require this special handling, perhaps with a link to relevant Azure/OpenAI documentation. This would help future maintainers (including future me) understand the context behind this workaround.
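One possible shape for such a comment, worded only from what this PR itself asserts (no documentation URL is included rather than guessing one):

// GPT-5-family models expect `max_completion_tokens` rather than the older
// `max_tokens` parameter, so the request key is switched here; see the
// OpenAI/Azure chat-completions documentation for the authoritative parameter list.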

if (isGPT5Model && maxTokens) {
// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
Review comment (Author): Is there a way to avoid using @ts-ignore here? Could we extend the OpenAI types or create a custom interface that includes max_completion_tokens to maintain type safety? For example:

interface GPT5RequestOptions extends Omit<OpenAI.Chat.Completions.ChatCompletionCreateParamsStreaming, 'max_tokens'> {
  max_completion_tokens?: number
  max_tokens?: never
}
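A minimal sketch of how such an interface might be used to drop the @ts-ignore, reusing isGPT5Model, maxTokens, and requestOptions from the surrounding diff (GPT5RequestOptions is the hypothetical type sketched above, not an existing export):

if (isGPT5Model && maxTokens) {
  // One widening cast instead of suppressing the checker on each assignment.
  const gpt5Options = requestOptions as unknown as GPT5RequestOptions
  gpt5Options.max_completion_tokens = maxTokens
} else if (maxTokens) {
  requestOptions.max_tokens = maxTokens
}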

requestOptions.max_completion_tokens = maxTokens
} else if (maxTokens) {
requestOptions.max_tokens = maxTokens
}

if (this.supportsTemperature(modelId)) {
requestOptions.temperature = this.options.modelTemperature ?? 0
}
@@ -179,6 +189,9 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
async completePrompt(prompt: string): Promise<string> {
const { id: modelId, info } = await this.fetchModel()

// Check if this is a GPT-5 model that requires max_completion_tokens instead of max_tokens
const isGPT5Model = modelId.toLowerCase().includes("gpt-5") || modelId.toLowerCase().includes("gpt5")
Review comment (Author): This detection logic is duplicated from line 111. Would it be cleaner to extract this into a helper method to maintain DRY principles? Something like:

private isGPT5Model(modelId: string): boolean {
  const modelLower = modelId.toLowerCase()
  return modelLower.startsWith("gpt-5") || modelLower.startsWith("gpt5") || modelLower === "gpt5"
}


try {
const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = {
model: modelId,
@@ -189,7 +202,13 @@ export class LiteLLMHandler extends RouterProvider implements SingleCompletionHa
requestOptions.temperature = this.options.modelTemperature ?? 0
}

requestOptions.max_tokens = info.maxTokens
// GPT-5 models require max_completion_tokens instead of the deprecated max_tokens parameter
if (isGPT5Model && info.maxTokens) {
// @ts-ignore - max_completion_tokens is not in the OpenAI types yet but is supported
requestOptions.max_completion_tokens = info.maxTokens
} else if (info.maxTokens) {
requestOptions.max_tokens = info.maxTokens
}

const response = await this.client.chat.completions.create(requestOptions)
return response.choices[0]?.message.content || ""