diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py
index 078d31684425..50a322fd401f 100644
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -9,7 +9,8 @@
     ResponseOutputItem, ResponseOutputMessage, ResponseOutputText,
     ResponseReasoningItem)
 from openai.types.responses.response_function_web_search import (
     ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch)
+from openai.types.responses.response_output_item import McpCall
 from openai.types.responses.response_reasoning_item import (
     Content as ResponseReasoningTextContent)
 from openai.types.responses.tool import Tool
@@ -18,7 +19,7 @@
     Role, StreamableParser, SystemContent, TextContent, ToolDescription,
     load_harmony_encoding)
 
-from vllm.entrypoints.openai.protocol import ResponseInputOutputItem
+from vllm.entrypoints.openai.protocol import ResponseInputOutputItem, MessageMetadata, ResponseOutputItemWithMetadata
 from vllm.utils import random_uuid
 
 REASONING_EFFORT = {
@@ -166,8 +167,7 @@ def render_for_completion(messages: list[Message]) -> list[int]:
         conversation, Role.ASSISTANT)
     return token_ids
 
-
-def parse_output_message(message: Message) -> list[ResponseOutputItem]:
+def parse_output_message_openai_api_behavior(message: Message) -> list[ResponseOutputItem]:
     """
     Parse a Harmony message into a list of output response items.
     """
@@ -268,9 +268,102 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
         raise ValueError(f"Unknown channel: {message.channel}")
     return output_items
 
+def parse_output_message_verbose(message: Message, previous_output_items: list[ResponseOutputItemWithMetadata]) -> list[ResponseOutputItemWithMetadata]:
+    output_items: list[ResponseOutputItem] = []
+    recipient = message.recipient
+    message_metadata = MessageMetadata(
+        author=str(message.author),
+        channel=message.channel,
+        recipient=message.recipient,
+        content_type=message.content_type
+    )
+    # When recipient is not None, it is always a tool call or tool call output
+    if recipient is not None:
+        # This means that it is a tool call output of some sort
+        if recipient == "assistant":
+            mcp_call = None
+            # TODO: Support parallel tool calls
+            for prev_response in reversed(previous_output_items):
+                item = prev_response.item
+                if isinstance(item, McpCall):
+                    mcp_call = item
+                    break
+            if mcp_call is None:
+                raise ValueError("Received a tool call output without a prior tool call")
+            mcp_call.output = message.content[0].text_content
+            # No need to append any message since we are modifying in place
+        # Currently assuming any tool call recipient that starts with "functions" is executed client side, so it is a ResponseFunctionToolCall
+        elif recipient.startswith("functions"):
+            function_name = recipient.split(".")[-1]
+            for content in message.content:
+                random_id = random_uuid()
+                response_item = ResponseFunctionToolCall(
+                    arguments=content.text,
+                    call_id=f"call_{random_id}",
+                    type="function_call",
+                    name=function_name,
+                    id=f"ft_{random_id}",
+                )
+                output_items.append(response_item)
+        else:
+            for content in message.content:
+                random_id = random_uuid()
+                response_item = McpCall(
+                    arguments=content.text,
+                    id=f"call_{random_id}",
+                    type="mcp_call",
+                    name=recipient,
+                    server_label=recipient.split(".")[0]
+                )
+                output_items.append(response_item)
+    # Any message without a recipient on these channels is a reasoning message
+    elif message.channel == "analysis" or message.channel == "commentary":
+        for content in message.content:
+            reasoning_item = ResponseReasoningItem(
+                id=f"rs_{random_uuid()}",
+                summary=[],
+                type="reasoning",
+                content=[
+                    ResponseReasoningTextContent(text=content.text,
+                                                 type="reasoning_text")
+                ],
+                status=None,
+            )
+            output_items.append(reasoning_item)
+    # The final channel is not reasoning; it is a message to show to the user
+    elif message.channel == "final":
+        contents = []
+        for content in message.content:
+            output_text = ResponseOutputText(
+                text=content.text,
+                annotations=[],  # TODO
+                type="output_text",
+                logprobs=None,  # TODO
+            )
+            contents.append(output_text)
+        text_item = ResponseOutputMessage(
+            id=f"msg_{random_uuid()}",
+            content=contents,
+            role=message.author.role,
+            status="completed",
+            type="message",
+        )
+        output_items.append(text_item)
+    else:
+        raise ValueError(f"Unknown channel: {message.channel}")
+
+    # Wrap each ResponseOutputItem in a ResponseOutputItemWithMetadata
+    result = []
+    for item in output_items:
+        result.append(ResponseOutputItemWithMetadata(
+            item=item,
+            metadata=message_metadata
+        ))
+    return result
+
 
 
 def parse_remaining_state(
-        parser: StreamableParser) -> list[ResponseOutputItem]:
+        parser: StreamableParser, verbose: bool = False) -> Union[list[ResponseOutputItem], list[ResponseOutputItemWithMetadata]]:
     if not parser.current_content:
         return []
     if parser.current_role != Role.ASSISTANT:
@@ -280,6 +373,8 @@ def parse_remaining_state(
             and current_recipient.startswith("browser.")):
         return []
 
+    output_items = []
+
     if parser.current_channel == "analysis":
         reasoning_item = ResponseReasoningItem(
             id=f"rs_{random_uuid()}",
@@ -291,7 +386,7 @@ def parse_remaining_state(
             ],
             status=None,
         )
-        return [reasoning_item]
+        output_items = [reasoning_item]
     elif parser.current_channel == "final":
         output_text = ResponseOutputText(
             text=parser.current_content,
@@ -306,8 +401,17 @@ def parse_remaining_state(
             status="completed",
             type="message",
         )
-        return [text_item]
-    return []
+        output_items = [text_item]
+    if verbose and output_items:
+        message_metadata = MessageMetadata(author=str(Author(role=parser.current_role)),
+                                           channel=parser.current_channel,
+                                           recipient=parser.current_recipient,
+                                           content_type=parser.current_content_type)
+        return [ResponseOutputItemWithMetadata(
+            item=output_items[0],
+            metadata=message_metadata
+        )]
+    return output_items
 
 
 def get_stop_tokens_for_assistant_actions() -> list[int]:
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 488102232562..440647b7855c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -1843,6 +1843,16 @@ class ResponseUsage(OpenAIBaseModel):
     output_tokens_details: OutputTokensDetails
     total_tokens: int
 
+class MessageMetadata(BaseModel):
+    author: Optional[str] = None
+    channel: Optional[str] = None
+    recipient: Optional[str] = None
+    content_type: Optional[str] = None
+
+class ResponseOutputItemWithMetadata(BaseModel):
+    item: ResponseOutputItem
+    metadata: MessageMetadata
+
 
 class ResponsesResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
@@ -1853,7 +1863,7 @@ class ResponsesResponse(OpenAIBaseModel):
     metadata: Optional[Metadata] = None
     model: str
     object: Literal["response"] = "response"
-    output: list[ResponseOutputItem]
+    output: Union[list[ResponseOutputItem], list[ResponseOutputItemWithMetadata]]
     parallel_tool_calls: bool
     temperature: float
     tool_choice: ToolChoice
@@ -1880,7 +1890,7 @@ def from_request(
         sampling_params: SamplingParams,
         model_name: str,
         created_time: int,
-        output: list[ResponseOutputItem],
+        output: Union[list[ResponseOutputItem], list[ResponseOutputItemWithMetadata]],
         status: ResponseStatus,
         usage: Optional[ResponseUsage] = None,
     ) -> "ResponsesResponse":
@@ -2089,7 +2099,7 @@ class DetokenizeResponse(OpenAIBaseModel):
 
 class TokenizerInfoResponse(OpenAIBaseModel):
     """
-    Response containing tokenizer configuration 
+    Response containing tokenizer configuration
     equivalent to tokenizer_config.json
     """
 
@@ -2179,7 +2189,7 @@ class TranscriptionRequest(OpenAIBaseModel):
     to_language: Optional[str] = None
     """The language of the output audio we transcribe to.
 
-    Please note that this is not currently used by supported models at this 
+    Please note that this is not currently used by supported models at this
     time, but it is a placeholder for future use, matching translation api.
     """
 
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 4c15de303099..6cb1cbc1b010 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -41,7 +41,7 @@
     SimpleContext, StreamingHarmonyContext)
 from vllm.entrypoints.harmony_utils import (
     get_developer_message, get_stop_tokens_for_assistant_actions,
-    get_system_message, get_user_message, parse_output_message,
+    get_system_message, get_user_message, parse_output_message_openai_api_behavior, parse_output_message_verbose,
     parse_remaining_state, parse_response_input, render_for_completion)
 from vllm.entrypoints.logger import RequestLogger
 # yapf conflicts with isort for this block
@@ -89,6 +89,7 @@ def __init__(
         enable_force_include_usage: bool = False,
         enable_log_outputs: bool = False,
         log_error_stack: bool = False,
+        enable_openai_api_behavior: bool = True,
     ) -> None:
         super().__init__(
             engine_client=engine_client,
@@ -103,6 +104,8 @@ def __init__(
         self.chat_template = chat_template
         self.chat_template_content_format: Final = chat_template_content_format
         self.enable_log_outputs = enable_log_outputs
+        # Whether to match OpenAI API behavior or to return reasoning metadata and more tool information
+        self.enable_openai_api_behavior = enable_openai_api_behavior
 
         self.reasoning_parser: Optional[Callable[[AnyTokenizer],
                                                  ReasoningParser]] = None
@@ -617,9 +620,12 @@ def _make_response_output_items_with_harmony(
         output_items = []
         num_init_messages = context.num_init_messages
         for msg in context.messages[num_init_messages:]:
-            output_items.extend(parse_output_message(msg))
+            if self.enable_openai_api_behavior:
+                output_items.extend(parse_output_message_openai_api_behavior(msg))
+            else:
+                output_items.extend(parse_output_message_verbose(msg, previous_output_items=output_items))
         # Handle the generation stopped in the middle (if any).
-        last_items = parse_remaining_state(context.parser)
+        last_items = parse_remaining_state(context.parser, verbose=not self.enable_openai_api_behavior)
         if last_items:
             output_items.extend(last_items)
         return output_items
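
A minimal consumption sketch (not part of the diff) showing how a client of
ResponsesResponse might handle both output shapes introduced above. It assumes
only the two models added in protocol.py; unwrap_output is a hypothetical
helper, not a function in vllm.

    from vllm.entrypoints.openai.protocol import ResponseOutputItemWithMetadata

    def unwrap_output(items: list) -> list:
        """Strip metadata wrappers, surfacing Harmony routing info when present."""
        plain = []
        for entry in items:
            if isinstance(entry, ResponseOutputItemWithMetadata):
                # Verbose mode: each item carries its Harmony channel/recipient.
                print(f"channel={entry.metadata.channel}, "
                      f"recipient={entry.metadata.recipient}")
                plain.append(entry.item)
            else:
                # Default (OpenAI API behavior) mode: items are already plain.
                plain.append(entry)
        return plain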