Commit f263a4b

[gpt-oss] Support chat completion api (#22342)
1 parent 54991c5 commit f263a4b

File tree

vllm/entrypoints/harmony_utils.py
vllm/entrypoints/openai/protocol.py
vllm/entrypoints/openai/serving_chat.py

3 files changed: +183 -24 lines


vllm/entrypoints/harmony_utils.py

Lines changed: 34 additions & 0 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
+from collections.abc import Iterable
 from typing import Literal, Optional
 
 from openai.types.responses.tool import Tool

@@ -109,3 +110,36 @@ def get_stop_tokens_for_assistant_actions() -> list[int]:
 
 def get_streamable_parser_for_assistant() -> StreamableParser:
     return StreamableParser(get_encoding(), role=Role.ASSISTANT)
+
+
+def parse_output_into_messages(token_ids: Iterable[int]) -> StreamableParser:
+    parser = get_streamable_parser_for_assistant()
+    for token_id in token_ids:
+        parser.process(token_id)
+    return parser
+
+
+def parse_chat_output(
+        token_ids: list[int]) -> tuple[Optional[str], Optional[str], bool]:
+    parser = parse_output_into_messages(token_ids)
+    output_msgs = parser.messages
+    if len(output_msgs) == 0:
+        # The generation has stopped during reasoning.
+        is_tool_call = False
+        reasoning_content = parser.current_content
+        final_content = None
+    elif len(output_msgs) == 1:
+        # The generation has stopped during final message.
+        is_tool_call = False
+        reasoning_content = output_msgs[0].content[0].text
+        final_content = parser.current_content
+    else:
+        if len(output_msgs) != 2:
+            raise ValueError(
+                "Expected 2 output messages (reasoning and final), "
+                f"but got {len(output_msgs)}.")
+        reasoning_msg, final_msg = output_msgs
+        reasoning_content = reasoning_msg.content[0].text
+        final_content = final_msg.content[0].text
+        is_tool_call = final_msg.recipient is not None
+    return reasoning_content, final_content, is_tool_call
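The helper returns a (reasoning_content, final_content, is_tool_call) triple. A minimal sketch of how a caller might consume it, mirroring the full-response path this commit adds to serving_chat.py below; the wrapper function and its defaults are illustrative, not part of the commit:

from typing import Optional

from vllm.entrypoints.harmony_utils import parse_chat_output


def build_chat_message_fields(
        token_ids: list[int],
        include_reasoning: bool = True) -> dict[str, Optional[str]]:
    # Illustrative helper (not part of the commit): map the parsed harmony
    # output onto the fields a chat message would carry.
    reasoning_content, final_content, is_tool_call = parse_chat_output(
        token_ids)
    if not include_reasoning:
        # Mirrors the include_reasoning request flag handling.
        reasoning_content = None
    if is_tool_call:
        # The Chat Completion path raises in this case; tool calls for
        # gpt-oss are only handled by the Responses API at this point.
        raise NotImplementedError("Use the Responses API for tool calls.")
    return {"reasoning_content": reasoning_content, "content": final_content}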

vllm/entrypoints/openai/protocol.py

Lines changed: 4 additions & 0 deletions
@@ -323,6 +323,7 @@ def to_sampling_params(
         if (top_p := self.top_p) is None:
             top_p = default_sampling_params.get(
                 "top_p", self._DEFAULT_SAMPLING_PARAMS["top_p"])
+        stop_token_ids = default_sampling_params.get("stop_token_ids")
 
         # Structured output
         guided_decoding = None

@@ -340,6 +341,7 @@ def to_sampling_params(
             top_p=top_p,
             max_tokens=max_tokens,
             logprobs=self.top_logprobs,
+            stop_token_ids=stop_token_ids,
             output_kind=(RequestOutputKind.DELTA
                          if self.stream else RequestOutputKind.FINAL_ONLY),
             guided_decoding=guided_decoding,

@@ -404,6 +406,8 @@ class ChatCompletionRequest(OpenAIBaseModel):
         Literal["required"],
         ChatCompletionNamedToolChoiceParam,
     ]] = "none"
+    reasoning_effort: Optional[Literal["low", "medium", "high"]] = None
+    include_reasoning: bool = True
 
     # NOTE this will be ignored by vLLM -- the model determines the behavior
     parallel_tool_calls: Optional[bool] = False
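The two new fields, reasoning_effort and include_reasoning, are vLLM-side extensions of the Chat Completion request. A minimal client sketch of how they could be set; the server URL and model name are assumptions, and the fields are sent through extra_body so that an SDK version that does not know them still accepts the call:

# A minimal sketch, assuming a vLLM server at http://localhost:8000/v1 serving
# a gpt-oss model named "openai/gpt-oss-20b" (both names are assumptions).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[{"role": "user", "content": "Summarize KV caching in one line."}],
    # vLLM-specific fields added by this commit, passed via extra_body.
    extra_body={"reasoning_effort": "low", "include_reasoning": True},
)
print(response.choices[0].message.content)

With include_reasoning left at its default of True, the reasoning text is returned alongside content as a non-standard reasoning_content field on the message.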

vllm/entrypoints/openai/serving_chat.py

Lines changed: 145 additions & 24 deletions
@@ -12,13 +12,18 @@
 import partial_json_parser
 import regex as re
 from fastapi import Request
+from openai_harmony import Message as OpenAIMessage
 from pydantic import TypeAdapter
 
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
                                          ConversationMessage,
                                          random_tool_call_id)
+from vllm.entrypoints.harmony_utils import (
+    get_developer_message, get_stop_tokens_for_assistant_actions,
+    get_streamable_parser_for_assistant, get_system_message, parse_chat_input,
+    parse_chat_output, render_for_completion)
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionLogProb, ChatCompletionLogProbs,

@@ -35,6 +40,7 @@
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
     MistralToolCall)
 from vllm.entrypoints.utils import get_max_tokens
+from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.reasoning import ReasoningParser, ReasoningParserManager

@@ -125,6 +131,23 @@ def __init__(
             logger.info("Using default chat sampling params from %s: %s",
                         source, self.default_sampling_params)
 
+        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
+        if self.use_harmony:
+            if "stop_token_ids" not in self.default_sampling_params:
+                self.default_sampling_params["stop_token_ids"] = []
+            self.default_sampling_params["stop_token_ids"].extend(
+                get_stop_tokens_for_assistant_actions())
+
+        # NOTE(woosuk): While OpenAI's chat completion API supports browsing
+        # for some models, currently vLLM doesn't support it. Please use the
+        # Responses API instead.
+        self.supports_browsing = False
+        self.browser_tool = None
+        # NOTE(woosuk): Chat completion API does not support code interpreter.
+        # Please use the Responses API instead.
+        self.supports_code_interpreter = False
+        self.python_tool = None
+
     async def create_chat_completion(
         self,
         request: ChatCompletionRequest,

@@ -169,7 +192,8 @@ async def create_chat_completion(
 
             if (request.tool_choice == "auto" and
                     not (self.enable_auto_tools and tool_parser is not None)
-                    and not isinstance(tokenizer, MistralTokenizer)):
+                    and not isinstance(tokenizer, MistralTokenizer)
+                    and not self.use_harmony):
                 # for hf tokenizers, "auto" tools requires
                 # --enable-auto-tool-choice and --tool-call-parser
                 return self.create_error_response(

@@ -184,25 +208,35 @@ async def create_chat_completion(
             else:
                 tool_dicts = [tool.model_dump() for tool in request.tools]
 
-            (
-                conversation,
-                request_prompts,
-                engine_prompts,
-            ) = await self._preprocess_chat(
-                request,
-                tokenizer,
-                request.messages,
-                chat_template=request.chat_template or self.chat_template,
-                chat_template_content_format=self.chat_template_content_format,
-                add_generation_prompt=request.add_generation_prompt,
-                continue_final_message=request.continue_final_message,
-                tool_dicts=tool_dicts,
-                documents=request.documents,
-                chat_template_kwargs=request.chat_template_kwargs,
-                tool_parser=tool_parser,
-                truncate_prompt_tokens=request.truncate_prompt_tokens,
-                add_special_tokens=request.add_special_tokens,
-            )
+            if not self.use_harmony:
+                # Common case.
+                (
+                    conversation,
+                    request_prompts,
+                    engine_prompts,
+                ) = await self._preprocess_chat(
+                    request,
+                    tokenizer,
+                    request.messages,
+                    chat_template=request.chat_template or self.chat_template,
+                    chat_template_content_format=self.
+                    chat_template_content_format,
+                    add_generation_prompt=request.add_generation_prompt,
+                    continue_final_message=request.continue_final_message,
+                    tool_dicts=tool_dicts,
+                    documents=request.documents,
+                    chat_template_kwargs=request.chat_template_kwargs,
+                    tool_parser=tool_parser,
+                    truncate_prompt_tokens=request.truncate_prompt_tokens,
+                    add_special_tokens=request.add_special_tokens,
+                )
+            else:
+                # For GPT-OSS.
+                (
+                    conversation,
+                    request_prompts,
+                    engine_prompts,
+                ) = self._make_request_with_harmony(request)
         except (ValueError, TypeError, RuntimeError,
                 jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")

@@ -436,6 +470,11 @@ async def chat_completion_stream_generator(
         finish_reason_sent = [False] * num_choices
         num_prompt_tokens = 0
         num_cached_tokens = None
+        if self.use_harmony:
+            harmony_parsers = [
+                get_streamable_parser_for_assistant()
+                for _ in range(num_choices)
+            ]
 
         if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
             tool_choice_function_name = request.tool_choice.function.name

@@ -597,7 +636,18 @@
                 else:
                     logprobs = None
 
-                delta_text = output.text
+                if self.use_harmony:
+                    harmony_parser = harmony_parsers[i]
+                    for token_id in output.token_ids:
+                        harmony_parser.process(token_id)
+                    # FIXME(woosuk): Support function calling
+                    is_final = harmony_parser.current_channel == "final"
+                    if not (request.include_reasoning or is_final):
+                        # Skip the reasoning content.
+                        continue
+                    delta_text = harmony_parser.last_content_delta or ""
+                else:
+                    delta_text = output.text
 
                 if not delta_text and not output.token_ids and \
                     not previous_num_tokens[i]:

@@ -607,7 +657,8 @@
                 delta_message: Optional[DeltaMessage]
 
                 # just update previous_texts and previous_token_ids
-                if tool_choice_auto or self.reasoning_parser:
+                if ((tool_choice_auto or self.reasoning_parser)
+                        and not self.use_harmony):
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]

@@ -621,8 +672,14 @@
                 else:
                     current_token_ids = list(output.token_ids)
 
+                if self.use_harmony:
+                    if is_final:
+                        delta_message = DeltaMessage(content=delta_text)
+                    else:
+                        delta_message = DeltaMessage(
+                            reasoning_content=delta_text)
                 # handle streaming deltas for tools with named tool_choice
-                if tool_choice_function_name:
+                elif tool_choice_function_name:
                     if (self.reasoning_parser and not reasoning_end_arr[i]
                             and not reasoning_parser.is_reasoning_end(
                                 previous_token_ids)):

@@ -990,7 +1047,38 @@ async def chat_completion_full_generator(
                 )
             else:
                 logprobs = None
-            auto_tools_called = False
+
+            if self.use_harmony:
+                reasoning_content, final_content, is_tool_call = (
+                    parse_chat_output(token_ids))
+                if not request.include_reasoning:
+                    reasoning_content = None
+
+                if is_tool_call:
+                    # TODO(woosuk): Implement tool call for gpt-oss.
+                    # For now, only Responses API supports tool call for
+                    # gpt-oss.
+                    raise NotImplementedError(
+                        "Tool call in Chat Completion API is not supported "
+                        "for gpt-oss yet. Please use Responses API instead.")
+                else:
+                    # Normal message
+                    message = ChatMessage(
+                        role=role,
+                        reasoning_content=reasoning_content,
+                        content=final_content,
+                    )
+
+                choice_data = ChatCompletionResponseChoice(
+                    index=output.index,
+                    message=message,
+                    logprobs=logprobs,
+                    finish_reason="tool_calls" if is_tool_call else
+                    output.finish_reason if output.finish_reason else "stop",
+                    stop_reason=output.stop_reason,
+                )
+                choices.append(choice_data)
+                continue
 
             if self.reasoning_parser:
                 try:

@@ -1003,10 +1091,13 @@
                 reasoning_content, content = (
                     reasoning_parser.extract_reasoning_content(
                         output.text, request=request))
+                if not request.include_reasoning:
+                    reasoning_content = None
             else:
                 reasoning_content = None
                 content = output.text
 
+            auto_tools_called = False
             # if auto tools are not enabled, and a named tool choice using
             # outlines is not being used
             if (not self.enable_auto_tools or not self.tool_parser) and \

@@ -1261,3 +1352,33 @@ def _should_check_for_unstreamed_tool_arg_tokens(
             and delta_message.tool_calls[0].function
             and delta_message.tool_calls[0].function.arguments is not None
         )
+
+    def _make_request_with_harmony(
+        self,
+        request: ChatCompletionRequest,
+    ):
+        messages: list[OpenAIMessage] = []
+
+        # Add system message.
+        # NOTE: In Chat Completion API, browsing is enabled by default
+        # if the model supports it. TODO: Support browsing.
+        assert not self.supports_browsing
+        assert not self.supports_code_interpreter
+        sys_msg = get_system_message(
+            reasoning_effort=request.reasoning_effort,
+            browser_description=None,
+            python_description=None)
+        messages.append(sys_msg)
+
+        # Add developer message.
+        dev_msg = get_developer_message()
+        messages.append(dev_msg)
+
+        # Add user message.
+        for chat_msg in request.messages:
+            messages.append(parse_chat_input(chat_msg))
+
+        # Render prompt token ids.
+        prompt_token_ids = render_for_completion(messages)
+        engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
+        return messages, [prompt_token_ids], [engine_prompt]
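On the streaming path above, reasoning-channel deltas are emitted as DeltaMessage(reasoning_content=...) and final-channel deltas as DeltaMessage(content=...). A minimal consumer sketch under the same server and model-name assumptions as the earlier example; reading reasoning_content off a delta relies on the SDK exposing fields it does not know about, so treat it as illustrative rather than guaranteed:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
stream = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[{"role": "user", "content": "What is 17 * 24?"}],
    stream=True,
)
for chunk in stream:
    if not chunk.choices:
        continue
    delta = chunk.choices[0].delta
    # Non-standard field carrying reasoning-channel text, when present.
    reasoning_delta = getattr(delta, "reasoning_content", None)
    if reasoning_delta:
        print(reasoning_delta, end="", flush=True)
    if delta.content:
        print(delta.content, end="", flush=True)
print()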
