119 changes: 111 additions & 8 deletions vllm/entrypoints/harmony_utils.py
@@ -9,7 +9,7 @@
ResponseOutputItem, ResponseOutputMessage,
ResponseOutputText, ResponseReasoningItem)
from openai.types.responses.response_function_web_search import (
ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch)
    ActionFind, ActionOpenPage, ActionSearch, ResponseFunctionWebSearch)
from openai.types.responses.response_output_item import McpCall
from openai.types.responses.response_reasoning_item import (
Content as ResponseReasoningTextContent)
from openai.types.responses.tool import Tool
@@ -18,7 +18,7 @@
Role, StreamableParser, SystemContent, TextContent,
ToolDescription, load_harmony_encoding)

from vllm.entrypoints.openai.protocol import ResponseInputOutputItem
from vllm.entrypoints.openai.protocol import (
    MessageMetadata, ResponseInputOutputItem, ResponseOutputItemWithMetadata)
from vllm.utils import random_uuid

REASONING_EFFORT = {
@@ -166,8 +166,7 @@ def render_for_completion(messages: list[Message]) -> list[int]:
conversation, Role.ASSISTANT)
return token_ids


def parse_output_message(message: Message) -> list[ResponseOutputItem]:
def parse_output_message_openai_api_behavior(
        message: Message) -> list[ResponseOutputItem]:
"""
Parse a Harmony message into a list of output response items.
"""
@@ -268,9 +267,102 @@ def parse_output_message(message: Message) -> list[ResponseOutputItem]:
raise ValueError(f"Unknown channel: {message.channel}")
return output_items

def parse_output_message_verbose(
    message: Message,
    previous_output_items: list[ResponseOutputItemWithMetadata],
) -> list[ResponseOutputItemWithMetadata]:
    output_items: list[ResponseOutputItem] = []
    recipient = message.recipient
    message_metadata = MessageMetadata(
        author=str(message.author),
        channel=message.channel,
        recipient=message.recipient,
        content_type=message.content_type,
    )
    # When recipient is not None, the message is always a tool call
    # or a tool call output.
    if recipient is not None:
        if recipient == "assistant":
            # A recipient of "assistant" means this is a tool call output.
            mcp_call = None
            # TODO: Support parallel tool calls.
            for prev_response in reversed(previous_output_items):
                item = prev_response.item
                if isinstance(item, McpCall):
                    mcp_call = item
                    break
            if mcp_call is None:
                raise ValueError(
                    "Received a tool call output without a prior tool call")
            mcp_call.output = message.content[0].text
            # No item is appended here: the prior McpCall is modified in
            # place.
        elif recipient.startswith("functions."):
            # Any recipient under the "functions" namespace is assumed to be
            # executed client-side, so it maps to a ResponseFunctionToolCall.
            function_name = recipient.split(".")[-1]
            for content in message.content:
                random_id = random_uuid()
                response_item = ResponseFunctionToolCall(
                    arguments=content.text,
                    call_id=f"call_{random_id}",
                    type="function_call",
                    name=function_name,
                    id=f"ft_{random_id}",
                )
                output_items.append(response_item)
        else:
            # All other recipients are treated as server-side MCP calls.
            for content in message.content:
                random_id = random_uuid()
                response_item = McpCall(
                    arguments=content.text,
                    id=f"call_{random_id}",
                    type="mcp_call",
                    name=recipient,
                    server_label=recipient.split(".")[0],
                )
                output_items.append(response_item)
    elif message.channel in ("analysis", "commentary"):
        # Any message without a recipient on these channels is a reasoning
        # message.
        for content in message.content:
            reasoning_item = ResponseReasoningItem(
                id=f"rs_{random_uuid()}",
                summary=[],
                type="reasoning",
                content=[
                    ResponseReasoningTextContent(text=content.text,
                                                 type="reasoning_text")
                ],
                status=None,
            )
            output_items.append(reasoning_item)
    elif message.channel == "final":
        # The final channel is not reasoning; it carries the message shown
        # to the user.
        contents = []
        for content in message.content:
            output_text = ResponseOutputText(
                text=content.text,
                annotations=[],  # TODO
                type="output_text",
                logprobs=None,  # TODO
            )
            contents.append(output_text)
        text_item = ResponseOutputMessage(
            id=f"msg_{random_uuid()}",
            content=contents,
            role=message.author.role,
            status="completed",
            type="message",
        )
        output_items.append(text_item)
    else:
        raise ValueError(f"Unknown channel: {message.channel}")

    # Wrap each ResponseOutputItem with the message's metadata.
    return [
        ResponseOutputItemWithMetadata(item=item, metadata=message_metadata)
        for item in output_items
    ]


def parse_remaining_state(
parser: StreamableParser) -> list[ResponseOutputItem]:
    parser: StreamableParser, verbose: bool = False
) -> Union[list[ResponseOutputItem], list[ResponseOutputItemWithMetadata]]:
if not parser.current_content:
return []
if parser.current_role != Role.ASSISTANT:
@@ -280,6 +372,8 @@ def parse_remaining_state(
and current_recipient.startswith("browser.")):
return []

output_items = []

if parser.current_channel == "analysis":
reasoning_item = ResponseReasoningItem(
id=f"rs_{random_uuid()}",
@@ -291,7 +385,7 @@
],
status=None,
)
return [reasoning_item]
output_items = [reasoning_item]
elif parser.current_channel == "final":
output_text = ResponseOutputText(
text=parser.current_content,
@@ -306,8 +400,17 @@
status="completed",
type="message",
)
return [text_item]
return []
        output_items = [text_item]
    if not output_items:
        return []
    if verbose:
        message_metadata = MessageMetadata(
            author=str(Author(role=parser.current_role)),
            channel=parser.current_channel,
            recipient=parser.current_recipient,
            content_type=parser.current_content_type,
        )
        return [
            ResponseOutputItemWithMetadata(item=item,
                                           metadata=message_metadata)
            for item in output_items
        ]
    return output_items


def get_stop_tokens_for_assistant_actions() -> list[int]:
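To make the review concrete, here is a minimal sketch of how the verbose path is meant to be consumed, assuming the `openai_harmony` message-builder API (`Message.from_role_and_content(...).with_channel(...)`); the message content and printed values are illustrative:

```python
from openai_harmony import Message, Role

from vllm.entrypoints.harmony_utils import parse_output_message_verbose

# A final-channel assistant message, as the Harmony parser would emit it.
message = Message.from_role_and_content(
    Role.ASSISTANT, "The answer is 4.").with_channel("final")

# No prior tool calls in this turn, so the running history is empty.
wrapped = parse_output_message_verbose(message, previous_output_items=[])

for entry in wrapped:
    # entry.item is the plain Responses API item; entry.metadata keeps the
    # Harmony routing fields that the OpenAI-compatible path discards.
    print(entry.item.type)         # "message"
    print(entry.metadata.channel)  # "final"
```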
18 changes: 14 additions & 4 deletions vllm/entrypoints/openai/protocol.py
@@ -1843,6 +1843,16 @@ class ResponseUsage(OpenAIBaseModel):
output_tokens_details: OutputTokensDetails
total_tokens: int

class MessageMetadata(BaseModel):
    """Harmony routing metadata for a single parsed message."""
    author: Optional[str] = None
    channel: Optional[str] = None
    recipient: Optional[str] = None
    content_type: Optional[str] = None


class ResponseOutputItemWithMetadata(BaseModel):
    """A Responses API output item paired with its Harmony metadata."""
    item: ResponseOutputItem
    metadata: MessageMetadata


class ResponsesResponse(OpenAIBaseModel):
id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
@@ -1853,7 +1863,7 @@ class ResponsesResponse(OpenAIBaseModel):
metadata: Optional[Metadata] = None
model: str
object: Literal["response"] = "response"
output: list[ResponseOutputItem]
output: Union[list[ResponseOutputItem], list[ResponseOutputItemWithMetadata]]
parallel_tool_calls: bool
temperature: float
tool_choice: ToolChoice
@@ -1880,7 +1890,7 @@ def from_request(
sampling_params: SamplingParams,
model_name: str,
created_time: int,
output: list[ResponseOutputItem],
        output: Union[list[ResponseOutputItem],
                      list[ResponseOutputItemWithMetadata]],
status: ResponseStatus,
usage: Optional[ResponseUsage] = None,
) -> "ResponsesResponse":
@@ -2089,7 +2099,7 @@ class DetokenizeResponse(OpenAIBaseModel):

class TokenizerInfoResponse(OpenAIBaseModel):
"""
Response containing tokenizer configuration
Response containing tokenizer configuration
equivalent to tokenizer_config.json
"""

@@ -2179,7 +2189,7 @@ class TranscriptionRequest(OpenAIBaseModel):
to_language: Optional[str] = None
"""The language of the output audio we transcribe to.

Please note that this is not currently used by supported models at this
Please note that this is not currently used by supported models at this
time, but it is a placeholder for future use, matching translation api.
"""

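A quick sketch of how the two new models nest on the wire; the field values here are illustrative:

```python
from openai.types.responses import ResponseReasoningItem

from vllm.entrypoints.openai.protocol import (MessageMetadata,
                                              ResponseOutputItemWithMetadata)

# Wrap a reasoning item the way the verbose Harmony parser does.
wrapped = ResponseOutputItemWithMetadata(
    item=ResponseReasoningItem(id="rs_123", summary=[], type="reasoning"),
    metadata=MessageMetadata(author="assistant", channel="analysis"),
)

# Pydantic serializes the pair as a nested object, so clients can read
# `item` exactly as before and treat `metadata` as additive.
print(wrapped.model_dump_json())
```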
12 changes: 9 additions & 3 deletions vllm/entrypoints/openai/serving_responses.py
@@ -41,7 +41,7 @@
SimpleContext, StreamingHarmonyContext)
from vllm.entrypoints.harmony_utils import (
get_developer_message, get_stop_tokens_for_assistant_actions,
get_system_message, get_user_message, parse_output_message,
    get_system_message, get_user_message,
    parse_output_message_openai_api_behavior, parse_output_message_verbose,
parse_remaining_state, parse_response_input, render_for_completion)
from vllm.entrypoints.logger import RequestLogger
# yapf conflicts with isort for this block
@@ -89,6 +89,7 @@ def __init__(
enable_force_include_usage: bool = False,
enable_log_outputs: bool = False,
log_error_stack: bool = False,
        enable_openai_api_behavior: bool = True,
) -> None:
super().__init__(
engine_client=engine_client,
@@ -103,6 +104,8 @@
self.chat_template = chat_template
self.chat_template_content_format: Final = chat_template_content_format
self.enable_log_outputs = enable_log_outputs
        # Whether to match OpenAI Responses API output exactly. When False,
        # each output item also carries Harmony metadata (author, channel,
        # recipient) and richer tool-call information.
        self.enable_openai_api_behavior = enable_openai_api_behavior

self.reasoning_parser: Optional[Callable[[AnyTokenizer],
ReasoningParser]] = None
@@ -617,9 +620,12 @@ def _make_response_output_items_with_harmony(
output_items = []
num_init_messages = context.num_init_messages
for msg in context.messages[num_init_messages:]:
output_items.extend(parse_output_message(msg))
            if self.enable_openai_api_behavior:
                output_items.extend(
                    parse_output_message_openai_api_behavior(msg))
            else:
                output_items.extend(
                    parse_output_message_verbose(
                        msg, previous_output_items=output_items))
# Handle the generation stopped in the middle (if any).
last_items = parse_remaining_state(context.parser)
        last_items = parse_remaining_state(
            context.parser, verbose=not self.enable_openai_api_behavior)
if last_items:
output_items.extend(last_items)
return output_items
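Finally, a hedged sketch of what a caller sees under each mode. The helper below is hypothetical (not part of this PR) and only shows how a client might branch on the wrapper type:

```python
from vllm.entrypoints.openai.protocol import (ResponseOutputItemWithMetadata,
                                              ResponsesResponse)


def summarize_output(response: ResponsesResponse) -> None:
    """Print one line per output item, with Harmony metadata when present."""
    for entry in response.output:
        if isinstance(entry, ResponseOutputItemWithMetadata):
            # Verbose mode: the wrapper exposes channel/recipient/author.
            print(entry.metadata.channel, entry.item.type)
        else:
            # OpenAI-compatible mode: a plain ResponseOutputItem.
            print(entry.type)
```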