119 changes: 111 additions & 8 deletions vllm/entrypoints/harmony_utils.py
@@ -9,7 +9,7 @@
ResponseOutputItem, ResponseOutputMessage,
ResponseOutputText, ResponseReasoningItem)
from openai.types.responses.response_function_web_search import (
ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch)
    ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch)
from openai.types.responses.response_output_item import McpCall
from openai.types.responses.response_reasoning_item import (
Content as ResponseReasoningTextContent)
from openai.types.responses.tool import Tool
@@ -18,7 +18,7 @@
Role, StreamableParser, SystemContent, TextContent,
ToolDescription, load_harmony_encoding)

from vllm.entrypoints.openai.protocol import ResponseInputOutputItem
from vllm.entrypoints.openai.protocol import (
    MessageMetadata, ResponseInputOutputItem, ResponseOutputItemWithMetadata)
from vllm.utils import random_uuid

REASONING_EFFORT = {
@@ -166,8 +166,7 @@ def render_for_completion(messages: list[Message]) -> list[int]:
conversation, Role.ASSISTANT)
return token_ids


def parse_output_message(message: Message) -> list[ResponseOutputItem]:
def parse_output_message_openai_api_behavior(
        message: Message) -> list[ResponseOutputItem]:
"""
Parse a Harmony message into a list of output response items.
"""
@@ -268,9 +267,102 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
raise ValueError(f"Unknown channel: {message.channel}")
return output_items

def parse_output_message_verbose(
    message: Message,
    previous_output_items: list[ResponseOutputItemWithMetadata],
) -> list[ResponseOutputItemWithMetadata]:
    output_items: list[ResponseOutputItem] = []
    recipient = message.recipient
    message_metadata = MessageMetadata(
        author=str(message.author),
        channel=message.channel,
        recipient=message.recipient,
        content_type=message.content_type,
    )
    # When recipient is not None, the message is always a tool call
    # or a tool call output.
    if recipient is not None:
        if recipient == "assistant":
            # A recipient of "assistant" means this is a tool call output.
            mcp_call = None
            # TODO: Support parallel tool calls.
            for prev_response in reversed(previous_output_items):
                item = prev_response.item
                if isinstance(item, McpCall):
                    mcp_call = item
                    break
            if mcp_call is None:
                raise ValueError(
                    "Received a tool call output without a prior tool call")
            mcp_call.output = message.content[0].text
            # No item is appended here: the prior McpCall is modified in
            # place.
        elif recipient.startswith("functions."):
            # Any recipient under the "functions" namespace is assumed to be
            # executed client-side, so it maps to a ResponseFunctionToolCall.
            function_name = recipient.split(".")[-1]
            for content in message.content:
                random_id = random_uuid()
                response_item = ResponseFunctionToolCall(
                    arguments=content.text,
                    call_id=f"call_{random_id}",
                    type="function_call",
                    name=function_name,
                    id=f"ft_{random_id}",
                )
                output_items.append(response_item)
        else:
            # All other recipients are treated as server-side MCP calls.
            for content in message.content:
                random_id = random_uuid()
                response_item = McpCall(
                    arguments=content.text,
                    id=f"call_{random_id}",
                    type="mcp_call",
                    name=recipient,
                    server_label=recipient.split(".")[0],
                )
                output_items.append(response_item)
    elif message.channel in ("analysis", "commentary"):
        # Any message without a recipient on these channels is a reasoning
        # message.
        for content in message.content:
            reasoning_item = ResponseReasoningItem(
                id=f"rs_{random_uuid()}",
                summary=[],
                type="reasoning",
                content=[
                    ResponseReasoningTextContent(text=content.text,
                                                 type="reasoning_text")
                ],
                status=None,
            )
            output_items.append(reasoning_item)
    elif message.channel == "final":
        # The final channel is not reasoning; it carries the message shown
        # to the user.
        contents = []
        for content in message.content:
            output_text = ResponseOutputText(
                text=content.text,
                annotations=[],  # TODO
                type="output_text",
                logprobs=None,  # TODO
            )
            contents.append(output_text)
        text_item = ResponseOutputMessage(
            id=f"msg_{random_uuid()}",
            content=contents,
            role=message.author.role,
            status="completed",
            type="message",
        )
        output_items.append(text_item)
    else:
        raise ValueError(f"Unknown channel: {message.channel}")

    # Wrap each ResponseOutputItem with the message's metadata.
    return [
        ResponseOutputItemWithMetadata(item=item, metadata=message_metadata)
        for item in output_items
    ]


def parse_remaining_state(
parser: StreamableParser) -> list[ResponseOutputItem]:
    parser: StreamableParser, verbose: bool = False
) -> Union[list[ResponseOutputItem], list[ResponseOutputItemWithMetadata]]:
if not parser.current_content:
return []
if parser.current_role != Role.ASSISTANT:
@@ -280,6 +372,8 @@ def parse_remaining_state(
and current_recipient.startswith("browser.")):
return []

output_items = []

if parser.current_channel == "analysis":
reasoning_item = ResponseReasoningItem(
id=f"rs_{random_uuid()}",
@@ -291,7 +385,7 @@
],
status=None,
)
return [reasoning_item]
output_items = [reasoning_item]
elif parser.current_channel == "final":
output_text = ResponseOutputText(
text=parser.current_content,
@@ -306,8 +400,17 @@
status="completed",
type="message",
)
return [text_item]
return []
        output_items = [text_item]
    if not output_items:
        return []
    if verbose:
        message_metadata = MessageMetadata(
            author=str(Author(role=parser.current_role)),
            channel=parser.current_channel,
            recipient=parser.current_recipient,
            content_type=parser.current_content_type,
        )
        return [
            ResponseOutputItemWithMetadata(item=item,
                                           metadata=message_metadata)
            for item in output_items
        ]
    return output_items


def get_stop_tokens_for_assistant_actions() -> list[int]:
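To make the review concrete, here is a minimal sketch of how the verbose path is meant to be consumed, assuming the `openai_harmony` message-builder API (`Message.from_role_and_content(...).with_channel(...)`); the message content and printed values are illustrative:

```python
from openai_harmony import Message, Role

from vllm.entrypoints.harmony_utils import parse_output_message_verbose

# A final-channel assistant message, as the Harmony parser would emit it.
message = Message.from_role_and_content(
    Role.ASSISTANT, "The answer is 4.").with_channel("final")

# No prior tool calls in this turn, so the running history is empty.
wrapped = parse_output_message_verbose(message, previous_output_items=[])

for entry in wrapped:
    # entry.item is the plain Responses API item; entry.metadata keeps the
    # Harmony routing fields that the OpenAI-compatible path discards.
    print(entry.item.type)         # "message"
    print(entry.metadata.channel)  # "final"
```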
18 changes: 14 additions & 4 deletions vllm/entrypoints/openai/protocol.py
@@ -1843,6 +1843,16 @@ class ResponseUsage(OpenAIBaseModel):
output_tokens_details: OutputTokensDetails
total_tokens: int

class MessageMetadata(BaseModel):
    """Harmony routing metadata for a single parsed message."""
    author: Optional[str] = None
    channel: Optional[str] = None
    recipient: Optional[str] = None
    content_type: Optional[str] = None


class ResponseOutputItemWithMetadata(BaseModel):
    """A Responses API output item paired with its Harmony metadata."""
    item: ResponseOutputItem
    metadata: MessageMetadata


class ResponsesResponse(OpenAIBaseModel):
id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
@@ -1853,7 +1863,7 @@ class ResponsesResponse(OpenAIBaseModel):
metadata: Optional[Metadata] = None
model: str
object: Literal["response"] = "response"
output: list[ResponseOutputItem]
output: Union[list[ResponseOutputItem], list[ResponseOutputItemWithMetadata]]
parallel_tool_calls: bool
temperature: float
tool_choice: ToolChoice
@@ -1880,7 +1890,7 @@ def from_request(
sampling_params: SamplingParams,
model_name: str,
created_time: int,
output: list[ResponseOutputItem],
        output: Union[list[ResponseOutputItem],
                      list[ResponseOutputItemWithMetadata]],
status: ResponseStatus,
usage: Optional[ResponseUsage] = None,
) -> "ResponsesResponse":
@@ -2089,7 +2099,7 @@ class DetokenizeResponse(OpenAIBaseModel):

class TokenizerInfoResponse(OpenAIBaseModel):
"""
Response containing tokenizer configuration
Response containing tokenizer configuration
equivalent to tokenizer_config.json
"""

@@ -2179,7 +2189,7 @@ class TranscriptionRequest(OpenAIBaseModel):
to_language: Optional[str] = None
"""The language of the output audio we transcribe to.

Please note that this is not currently used by supported models at this
Please note that this is not currently used by supported models at this
time, but it is a placeholder for future use, matching translation api.
"""

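A quick sketch of how the two new models nest on the wire; the field values here are illustrative:

```python
from openai.types.responses import ResponseReasoningItem

from vllm.entrypoints.openai.protocol import (MessageMetadata,
                                              ResponseOutputItemWithMetadata)

# Wrap a reasoning item the way the verbose Harmony parser does.
wrapped = ResponseOutputItemWithMetadata(
    item=ResponseReasoningItem(id="rs_123", summary=[], type="reasoning"),
    metadata=MessageMetadata(author="assistant", channel="analysis"),
)

# Pydantic serializes the pair as a nested object, so clients can read
# `item` exactly as before and treat `metadata` as additive.
print(wrapped.model_dump_json())
```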
12 changes: 9 additions & 3 deletions vllm/entrypoints/openai/serving_responses.py
@@ -41,7 +41,7 @@
SimpleContext, StreamingHarmonyContext)
from vllm.entrypoints.harmony_utils import (
get_developer_message, get_stop_tokens_for_assistant_actions,
get_system_message, get_user_message, parse_output_message,
    get_system_message, get_user_message,
    parse_output_message_openai_api_behavior, parse_output_message_verbose,
parse_remaining_state, parse_response_input, render_for_completion)
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
@@ -89,6 +89,7 @@ def __init__(
enable_force_include_usage: bool = False,
enable_log_outputs: bool = False,
log_error_stack: bool = False,
        enable_openai_api_behavior: bool = True,
) -> None:
super().__init__(
engine_client=engine_client,
@@ -103,6 +104,8 @@
self.chat_template = chat_template
self.chat_template_content_format: Final = chat_template_content_format
self.enable_log_outputs = enable_log_outputs
        # Whether to match OpenAI Responses API output exactly. When False,
        # each output item also carries Harmony metadata (author, channel,
        # recipient) and richer tool-call information.
        self.enable_openai_api_behavior = enable_openai_api_behavior

self.reasoning_parser: Optional[Callable[[AnyTokenizer],
ReasoningParser]] = None
@@ -617,9 +620,12 @@ def _make_response_output_items_with_harmony(
output_items = []
num_init_messages = context.num_init_messages
for msg in context.messages[num_init_messages:]:
output_items.extend(parse_output_message(msg))
            if self.enable_openai_api_behavior:
                output_items.extend(
                    parse_output_message_openai_api_behavior(msg))
            else:
                output_items.extend(
                    parse_output_message_verbose(
                        msg, previous_output_items=output_items))
# Handle the generation stopped in the middle (if any).
last_items = parse_remaining_state(context.parser)
        last_items = parse_remaining_state(
            context.parser, verbose=not self.enable_openai_api_behavior)
if last_items:
output_items.extend(last_items)
return output_items
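Finally, a hedged sketch of what a caller sees under each mode. The helper below is hypothetical (not part of this PR) and only shows how a client might branch on the wrapper type:

```python
from vllm.entrypoints.openai.protocol import (ResponseOutputItemWithMetadata,
                                              ResponsesResponse)


def summarize_output(response: ResponsesResponse) -> None:
    """Print one line per output item, with Harmony metadata when present."""
    for entry in response.output:
        if isinstance(entry, ResponseOutputItemWithMetadata):
            # Verbose mode: the wrapper exposes channel/recipient/author.
            print(entry.metadata.channel, entry.item.type)
        else:
            # OpenAI-compatible mode: a plain ResponseOutputItem.
            print(entry.type)
```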