Commit f6278b6
[gpt-oss] Convert user input to harmony format (#22402)
Signed-off-by: Chen Zhang <[email protected]>
Co-authored-by: Woosuk Kwon <[email protected]>
1 parent ad6c655 commit f6278b6

File tree: 4 files changed (+216, -15 lines)

vllm/entrypoints/chat_utils.py

Lines changed: 3 additions & 1 deletion
@@ -29,6 +29,7 @@
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
 from openai.types.responses import ResponseInputImageParam
+from openai_harmony import Message as OpenAIHarmonyMessage
 from PIL import Image
 from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable
@@ -207,7 +208,8 @@ class CustomChatCompletionMessageParam(TypedDict, total=False):
 
 
 ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam,
-                                   CustomChatCompletionMessageParam]
+                                   CustomChatCompletionMessageParam,
+                                   OpenAIHarmonyMessage]
 
 
 # TODO: Make fields ReadOnly once mypy supports it
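
Note: with OpenAIHarmonyMessage in the union, chat message lists may now mix plain OpenAI-style dicts with pre-built Harmony messages. A minimal sketch of what the widened type admits (variable names are illustrative; the Message constructor is openai_harmony's public API):

# Both values below now satisfy ChatCompletionMessageParam.
from openai_harmony import Message, Role

from vllm.entrypoints.chat_utils import ChatCompletionMessageParam

plain_msg: ChatCompletionMessageParam = {
    "role": "user",
    "content": "What is the weather in SF?",
}
harmony_msg: ChatCompletionMessageParam = Message.from_role_and_content(
    Role.USER, "What is the weather in SF?")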

vllm/entrypoints/harmony_utils.py

Lines changed: 58 additions & 2 deletions
@@ -2,14 +2,18 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import datetime
 from collections.abc import Iterable, Sequence
-from typing import Literal, Optional
+from typing import Literal, Optional, Union
 
+from openai.types.responses import ResponseFunctionToolCall, ResponseOutputItem
 from openai.types.responses.tool import Tool
-from openai_harmony import (Conversation, DeveloperContent,
+from openai_harmony import (Author, Conversation, DeveloperContent,
                             HarmonyEncodingName, Message, ReasoningEffort,
                             Role, StreamableParser, SystemContent, TextContent,
                             ToolDescription, load_harmony_encoding)
 
+from vllm.entrypoints.openai.protocol import (ResponseInputOutputItem,
+                                              ResponseReasoningItem)
+
 REASONING_EFFORT = {
     "high": ReasoningEffort.HIGH,
     "medium": ReasoningEffort.MEDIUM,
@@ -85,6 +89,58 @@ def get_user_message(content: str) -> Message:
     return Message.from_role_and_content(Role.USER, content)
 
 
+def parse_response_input(
+    response_msg: ResponseInputOutputItem,
+    prev_responses: list[Union[ResponseOutputItem, ResponseReasoningItem]]
+) -> Message:
+    if not isinstance(response_msg, dict):
+        response_msg = response_msg.model_dump()
+    if "type" not in response_msg or response_msg["type"] == "message":
+        role = response_msg["role"]
+        content = response_msg["content"]
+        if role == "system":
+            # User is trying to set a system message. Change it to:
+            # <|start|>developer<|message|># Instructions
+            # {instructions}<|end|>
+            role = "developer"
+            text_prefix = "Instructions:\n"
+        else:
+            text_prefix = ""
+        if isinstance(content, str):
+            msg = Message.from_role_and_content(role, text_prefix + content)
+        else:
+            contents = [
+                TextContent(text=text_prefix + c["text"]) for c in content
+            ]
+            msg = Message.from_role_and_contents(role, contents)
+    elif response_msg["type"] == "function_call_output":
+        call_id = response_msg["call_id"]
+        call_response: Optional[ResponseFunctionToolCall] = None
+        for prev_response in reversed(prev_responses):
+            if isinstance(prev_response, ResponseFunctionToolCall
+                          ) and prev_response.call_id == call_id:
+                call_response = prev_response
+                break
+        if call_response is None:
+            raise ValueError(f"No call message found for {call_id}")
+        msg = Message.from_author_and_content(
+            Author.new(Role.TOOL, f"functions.{call_response.name}"),
+            response_msg["output"])
+    elif response_msg["type"] == "reasoning":
+        content = response_msg["content"]
+        assert len(content) == 1
+        msg = Message.from_role_and_content(Role.ASSISTANT, content[0]["text"])
+    elif response_msg["type"] == "function_call":
+        msg = Message.from_role_and_content(Role.ASSISTANT,
+                                            response_msg["arguments"])
+        msg = msg.with_channel("commentary")
+        msg = msg.with_recipient(f"functions.{response_msg['name']}")
+        msg = msg.with_content_type("json")
+    else:
+        raise ValueError(f"Unknown input type: {response_msg['type']}")
+    return msg
+
+
 def parse_chat_input(chat_msg) -> Message:
     role = chat_msg["role"]
     content = chat_msg["content"]
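
Note: a usage sketch for the new parse_response_input. A "function_call_output" item is matched by call_id against a prior turn's tool call and becomes a Harmony tool message authored as functions.<name>. The field values below are illustrative, not from this commit:

# Resolve a tool-call output against the prior function call it answers.
from openai.types.responses import ResponseFunctionToolCall

from vllm.entrypoints.harmony_utils import parse_response_input

prev_call = ResponseFunctionToolCall(
    type="function_call",
    id="fc_1",
    call_id="call_1",
    name="get_weather",
    arguments='{"city": "SF"}',
)
tool_output = {
    "type": "function_call_output",
    "call_id": "call_1",
    "output": '{"temperature_c": 18}',
}
# Authored as tool "functions.get_weather" per the branch above.
msg = parse_response_input(tool_output, [prev_call])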

vllm/entrypoints/openai/protocol.py

Lines changed: 7 additions & 2 deletions
@@ -17,7 +17,8 @@
 from openai.types.chat.chat_completion_message import (
     Annotation as OpenAIAnnotation)
 # yapf: enable
-from openai.types.responses import (ResponseInputParam, ResponseOutputItem,
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseInputItemParam, ResponseOutputItem,
                                     ResponseOutputMessage, ResponsePrompt,
                                     ResponseStatus, ResponseTextConfig)
 from openai.types.responses.response import ToolChoice
@@ -234,6 +235,10 @@ def get_logits_processors(processors: Optional[LogitsProcessors],
     return None
 
 
+ResponseInputOutputItem: TypeAlias = Union[ResponseInputItemParam,
+                                           ResponseFunctionToolCall]
+
+
 class ResponsesRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/responses/create
@@ -248,7 +253,7 @@ class ResponsesRequest(OpenAIBaseModel):
             "reasoning.encrypted_content",
         ],
     ]] = None
-    input: Union[str, ResponseInputParam]
+    input: Union[str, list[ResponseInputOutputItem]]
    instructions: Optional[str] = None
    max_output_tokens: Optional[int] = None
    max_tool_calls: Optional[int] = None
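
Note: since input now accepts a heterogeneous list, one request body can carry a user message, a prior function_call, and its function_call_output together. A client-side sketch (model name and field values are illustrative):

# A Responses API payload mixing the item types ResponseInputOutputItem allows.
request_body = {
    "model": "openai/gpt-oss-120b",
    "input": [
        {"type": "message", "role": "user", "content": "Weather in SF?"},
        {
            "type": "function_call",
            "call_id": "call_1",
            "name": "get_weather",
            "arguments": '{"city": "SF"}',
        },
        {
            "type": "function_call_output",
            "call_id": "call_1",
            "output": '{"temperature_c": 18}',
        },
    ],
}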

vllm/entrypoints/openai/serving_responses.py

Lines changed: 148 additions & 10 deletions
@@ -4,19 +4,26 @@
 import asyncio
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
+from copy import copy
 from http import HTTPStatus
 from typing import Callable, Final, Optional, Union
 
 import jinja2
 from fastapi import Request
-from openai.types.responses import ResponseOutputMessage, ResponseOutputText
+from openai.types.responses import (ResponseFunctionToolCall,
+                                    ResponseOutputMessage, ResponseOutputText)
+from openai_harmony import Message as OpenAIHarmonyMessage
 
 from vllm import envs
 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
                                          ChatTemplateContentFormatOption)
 from vllm.entrypoints.context import ConversationContext, SimpleContext
+from vllm.entrypoints.harmony_utils import (
+    get_developer_message, get_stop_tokens_for_assistant_actions,
+    get_system_message, get_user_message, parse_response_input,
+    render_for_completion)
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -30,6 +37,7 @@
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.entrypoints.tool_server import ToolServer
+from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 from vllm.sampling_params import SamplingParams
@@ -103,6 +111,29 @@ def __init__(
                 "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
                 "cause a memory leak since we never remove responses from "
                 "the store.")
+
+        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
+        if self.use_harmony:
+            logger.warning("For gpt-oss, we ignore --enable-auto-tool-choice "
+                           "and always enable tool use.")
+            # OpenAI models have two EOS-like tokens: <|return|> and <|call|>.
+            # We need to add them to the stop token ids.
+            if "stop_token_ids" not in self.default_sampling_params:
+                self.default_sampling_params["stop_token_ids"] = []
+            self.default_sampling_params["stop_token_ids"].extend(
+                get_stop_tokens_for_assistant_actions())
+
+        # set up tool use
+        self.enable_auto_tools: bool = enable_auto_tools
+        if self.enable_auto_tools:
+            logger.info(
+                "\"auto\" tool choice has been enabled please note that while"
+                " the parallel_tool_calls client option is preset for "
+                "compatibility reasons, it will be ignored.")
+            if not self.use_harmony:
+                raise NotImplementedError("Auto tool choice is not supported "
+                                          "yet unless using Harmony")
+
         # HACK(woosuk): This is a hack. We should use a better store.
         # FIXME: If enable_store=True, this may cause a memory leak since we
         # never remove responses from the store.
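
Note: get_stop_tokens_for_assistant_actions already lives in harmony_utils; a sketch of the equivalent openai_harmony call it presumably wraps, assuming the HARMONY_GPT_OSS encoding:

# The ids appended to stop_token_ids above are the Harmony encoding's
# "assistant action" stops, i.e. the <|return|> and <|call|> tokens.
from openai_harmony import HarmonyEncodingName, load_harmony_encoding

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
stop_ids = encoding.stop_tokens_for_assistant_actions()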
@@ -165,21 +196,20 @@ async def create_responses(
             return self._make_not_found_error(prev_response_id)
         else:
             prev_response = None
-        # Construct the input messages.
-        messages = self._construct_input_messages(request, prev_response)
 
         try:
             lora_request = self._maybe_get_adapters(request)
             model_name = self._get_model_name(request.model, lora_request)
             tokenizer = await self.engine_client.get_tokenizer(lora_request)
 
-            _, request_prompts, engine_prompts = await self._preprocess_chat(
-                request,
-                tokenizer,
-                messages,
-                chat_template=self.chat_template,
-                chat_template_content_format=self.chat_template_content_format,
-            )
+            if self.use_harmony:
+                messages, request_prompts, engine_prompts = (
+                    self._make_request_with_harmony(request, prev_response))
+            else:
+                messages, request_prompts, engine_prompts = (
+                    await self._make_request(request, prev_response,
+                                             tokenizer))
+
         except (ValueError, TypeError, RuntimeError,
                 jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
@@ -275,6 +305,38 @@ async def create_responses(
         except Exception as e:
             return self.create_error_response(str(e))
 
+    async def _make_request(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+        tokenizer: AnyTokenizer,
+    ):
+        # Construct the input messages.
+        messages = self._construct_input_messages(request, prev_response)
+        _, request_prompts, engine_prompts = await self._preprocess_chat(
+            request,
+            tokenizer,
+            messages,
+            chat_template=self.chat_template,
+            chat_template_content_format=self.chat_template_content_format,
+        )
+        return messages, request_prompts, engine_prompts
+
+    def _make_request_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ):
+        if request.tool_choice != "auto":
+            raise NotImplementedError(
+                "Only 'auto' tool_choice is supported in "
+                "response API with Harmony")
+        messages = self._construct_input_messages_with_harmony(
+            request, prev_response)
+        prompt_token_ids = render_for_completion(messages)
+        engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
+        return messages, [prompt_token_ids], [engine_prompt]
+
     async def responses_full_generator(
         self,
         request: ResponsesRequest,
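
Note: the Harmony path bypasses the chat template and tokenizer entirely. A sketch of what render_for_completion roughly does, using openai_harmony's conversation API (the user message is illustrative):

# Render Harmony messages straight to prompt token ids, as
# _make_request_with_harmony does via render_for_completion.
from openai_harmony import (Conversation, HarmonyEncodingName, Message, Role,
                            load_harmony_encoding)

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
convo = Conversation.from_messages(
    [Message.from_role_and_content(Role.USER, "Weather in SF?")])
prompt_token_ids = encoding.render_conversation_for_completion(
    convo, Role.ASSISTANT)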
@@ -411,6 +473,82 @@ def _construct_input_messages(
             messages.extend(request.input)  # type: ignore
         return messages
 
+    def _construct_input_messages_with_harmony(
+        self,
+        request: ResponsesRequest,
+        prev_response: Optional[ResponsesResponse],
+    ) -> list[OpenAIHarmonyMessage]:
+        messages: list[OpenAIHarmonyMessage] = []
+        if prev_response is None:
+            # New conversation.
+            reasoning_effort = (request.reasoning.effort
+                                if request.reasoning else None)
+            tool_types = [tool.type for tool in request.tools]
+            enable_browser = ("web_search_preview" in tool_types
+                              and self.tool_server is not None
+                              and self.tool_server.has_tool("browser"))
+            enable_code_interpreter = ("code_interpreter" in tool_types
+                                       and self.tool_server is not None
+                                       and self.tool_server.has_tool("python"))
+            sys_msg = get_system_message(
+                reasoning_effort=reasoning_effort,
+                browser_description=self.tool_server.get_tool_description(
+                    "browser")
+                if enable_browser and self.tool_server is not None else None,
+                python_description=self.tool_server.get_tool_description(
+                    "python") if enable_code_interpreter
+                and self.tool_server is not None else None,
+            )
+            messages.append(sys_msg)
+            dev_msg = get_developer_message(request.instructions,
+                                            request.tools)
+            messages.append(dev_msg)
+        else:
+            # Continue the previous conversation.
+            # FIXME(woosuk): Currently, request params like reasoning and
+            # instructions are ignored.
+            prev_msgs = self.msg_store[prev_response.id]
+            # Remove the previous chain-of-thoughts if there is a new "final"
+            # message. Note that this also removes these messages from the
+            # msg_store.
+            if len(prev_msgs) > 0:
+                last_msg = prev_msgs[-1]
+                assert isinstance(last_msg, OpenAIHarmonyMessage)
+                if last_msg.channel == "final":
+                    prev_final_msg_idx = -1
+                    for i in range(len(prev_msgs) - 2, -1, -1):
+                        prev_msg_i = prev_msgs[i]
+                        assert isinstance(prev_msg_i, OpenAIHarmonyMessage)
+                        if prev_msg_i.channel == "final":
+                            prev_final_msg_idx = i
+                            break
+                    recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1:]
+                    del prev_msgs[prev_final_msg_idx + 1:]
+                    for msg in recent_turn_msgs:
+                        assert isinstance(msg, OpenAIHarmonyMessage)
+                        if msg.channel != "analysis":
+                            prev_msgs.append(msg)
+            messages.extend(prev_msgs)
+        # Append the new input.
+        # Responses API supports simple text inputs without chat format.
+        if isinstance(request.input, str):
+            messages.append(get_user_message(request.input))
+        else:
+            if prev_response is not None:
+                prev_outputs = copy(prev_response.output)
+            else:
+                prev_outputs = []
+            for response_msg in request.input:
+                messages.append(
+                    parse_response_input(response_msg, prev_outputs))
+                # User passes in a tool call request and its output. We need
+                # to add the tool call request to prev_outputs so that
+                # parse_response_input can find the tool call request when
+                # parsing the tool call output.
+                if isinstance(response_msg, ResponseFunctionToolCall):
+                    prev_outputs.append(response_msg)
+        return messages
+
     async def _run_background_request(
         self,
         request: ResponsesRequest,
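
Note: the chain-of-thought pruning above keeps every message up to the previous "final" message and drops "analysis" (reasoning) messages from the most recent completed turn. The same rule in isolation, as a standalone sketch (the helper name is hypothetical):

# Hypothetical helper mirroring the pruning loop in
# _construct_input_messages_with_harmony.
from openai_harmony import Message


def prune_recent_analysis(msgs: list[Message]) -> list[Message]:
    # Only prune when the conversation ends with a completed "final" turn.
    if not msgs or msgs[-1].channel != "final":
        return msgs
    # Locate the "final" message that closed the turn before this one.
    prev_final_idx = -1
    for i in range(len(msgs) - 2, -1, -1):
        if msgs[i].channel == "final":
            prev_final_idx = i
            break
    kept = msgs[:prev_final_idx + 1]
    # Drop chain-of-thought ("analysis") messages from the latest turn.
    kept += [m for m in msgs[prev_final_idx + 1:] if m.channel != "analysis"]
    return kept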
