diff --git a/tests/entrypoints/openai/test_response_api_harmony_input_output.py b/tests/entrypoints/openai/test_response_api_harmony_input_output.py
new file mode 100644
index 000000000000..91f88199889f
--- /dev/null
+++ b/tests/entrypoints/openai/test_response_api_harmony_input_output.py
@@ -0,0 +1,378 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import httpx
+import pytest
+import pytest_asyncio
+from openai import OpenAI
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "openai/gpt-oss-20b"
+
+
+@pytest.fixture(scope="module")
+def monkeypatch_module():
+    from _pytest.monkeypatch import MonkeyPatch
+    mpatch = MonkeyPatch()
+    yield mpatch
+    mpatch.undo()
+
+
+@pytest.fixture(scope="module")
+def server(monkeypatch_module: pytest.MonkeyPatch):
+    args = ["--enforce-eager", "--tool-server", "demo"]
+
+    with monkeypatch_module.context() as m:
+        m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1")
+        m.setenv("VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT", "1")
+        with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+            yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+async def send_harmony_request(server, data: dict) -> dict:
+    """Helper function to send requests with harmony messages using HTTP."""
+    async with httpx.AsyncClient(timeout=120.0) as http_client:
+        response = await http_client.post(
+            f"{server.url_root}/v1/responses",
+            json=data,
+            headers={"Authorization": f"Bearer {server.DUMMY_API_KEY}"})
+        response.raise_for_status()
+        return response.json()
+
+
+class HarmonyResponse:
+    """Helper class to make HTTP response look like OpenAI client response."""
+
+    def __init__(self, data: dict):
+        self.status = data["status"]
+        self.input_harmony_messages = data.get("input_harmony_messages", [])
+        self.output_harmony_messages = data.get("output_harmony_messages", [])
+        self.id = data.get("id")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_harmony_message_deserialization(client: OpenAI, model_name: str,
+                                               server):
+    """Test that harmony messages can be properly deserialized from JSON."""
+
+    # Create some harmony messages manually (as they would come from a client)
+    previous_harmony_messages = [{
+        "role":
+        "user",
+        "content": [{
+            "type": "text",
+            "text": "What is the capital of France?"
+        }],
+        "channel":
+        None,
+        "recipient":
+        None,
+        "content_type":
+        None
+    }, {
+        "role":
+        "assistant",
+        "content": [{
+            "type": "text",
+            "text": "The capital of France is Paris."
+        }],
+        "channel":
+        None,
+        "recipient":
+        None,
+        "content_type":
+        None
+    }]
+
+    # Use direct HTTP request since OpenAI client doesn't support custom params
+    response_json = await send_harmony_request(
+        server, {
+            "model": model_name,
+            "input": "Tell me more about that city.",
+            "instructions": "Use the previous conversation context.",
+            "previous_response_harmony_messages": previous_harmony_messages
+        })
+
+    response = HarmonyResponse(response_json)
+
+    assert response is not None
+    assert response.status == "completed"
+
+    # Verify the response includes both the previous and new messages
+    all_messages = (response.input_harmony_messages +
+                    response.output_harmony_messages)
+
+    # Verify that all messages have proper serialization
+    all_messages = (response.input_harmony_messages +
+                    response.output_harmony_messages)
+    for msg in all_messages:
+        assert "role" in msg
+        assert "content" in msg
+        assert isinstance(msg["content"], list)
+
+        # Ensure content is not empty objects
+        for content_item in msg["content"]:
+            assert isinstance(content_item, dict)
+            assert len(content_item) > 0  # Should not be empty {}
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_harmony_message_round_trip(client: OpenAI, model_name: str,
+                                          server):
+    """Test full round-trip: get harmony messages from response, send back."""
+
+    # First request using standard OpenAI client
+    response1 = await client.responses.create(
+        model=model_name,
+        input="What is 2 + 2?",
+        instructions="Provide a simple answer.")
+
+    assert response1 is not None
+    assert response1.status == "completed"
+
+    # Extract harmony messages from first response
+    first_input_messages = response1.input_harmony_messages
+    first_output_messages = response1.output_harmony_messages
+
+    # Combine all messages from first conversation
+    all_first_messages = first_input_messages + first_output_messages
+
+    # Second request using harmony messages from first response - use HTTP
+    response2_json = await send_harmony_request(
+        server, {
+            "model": model_name,
+            "input": "Now what is 3 + 3?",
+            "instructions": "Continue the math conversation.",
+            "previous_response_harmony_messages": all_first_messages
+        })
+
+    response2 = HarmonyResponse(response2_json)
+
+    assert response2 is not None
+    assert response2.status == "completed"
+
+    # Verify that second response contains more messages (original + new)
+    second_input_messages = response2.input_harmony_messages
+    second_output_messages = response2.output_harmony_messages
+
+    # Should have at least the messages from the first conversation plus new
+    assert len(second_input_messages) > len(first_input_messages)
+
+    # Verify all messages in the full conversation have proper content
+    all_second_messages = second_input_messages + second_output_messages
+    text_message_count = 0
+
+    for msg in all_second_messages:
+        assert "role" in msg
+        assert "content" in msg
+
+        for content_item in msg["content"]:
+            if content_item.get("type") == "text":
+                assert "text" in content_item
+                assert len(content_item["text"].strip()) > 0
+                text_message_count += 1
+
+    # Should have at least some text messages in the conversation
+    assert text_message_count > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_harmony_message_context_continuation(client: OpenAI,
+                                                    model_name: str, server):
+    """Test that harmony messages provide proper context continuation."""
+
+    # First establish context with a specific topic
+    response1_json = await send_harmony_request(
+        server, {
+            "model": model_name,
+            "input":
+            "I'm planning a trip to Tokyo. What's the best time to visit?",
+            "instructions": "Provide travel advice."
+        })
+
+    response1 = HarmonyResponse(response1_json)
+    assert response1.status == "completed"
+
+    # Get all messages from the first conversation
+    all_messages = (response1.input_harmony_messages +
+                    response1.output_harmony_messages)
+
+    # Continue the conversation with a follow-up question
+    response2_json = await send_harmony_request(
+        server, {
+            "model": model_name,
+            "input": "What about food recommendations for that city?",
+            "instructions": "Continue helping with travel planning.",
+            "previous_response_harmony_messages": all_messages
+        })
+
+    response2 = HarmonyResponse(response2_json)
+    assert response2.status == "completed"
+
+    # Verify context is maintained - should have more messages now
+    assert len(response2.input_harmony_messages) > len(
+        response1.input_harmony_messages)
+
+    # The conversation should contain references to the original topic
+    all_content = []
+    for msg in (response2.input_harmony_messages +
+                response2.output_harmony_messages):
+        for content_item in msg["content"]:
+            if content_item.get("type") == "text" and "text" in content_item:
+                all_content.append(content_item["text"].lower())
+
+    # Should contain references to the original context (Tokyo/trip)
+    conversation_text = " ".join(all_content)
+    assert ("tokyo" in conversation_text or "trip" in conversation_text
+            or "travel" in conversation_text)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_harmony_message_empty_list(client: OpenAI, model_name: str,
+                                          server):
+    """Test that empty harmony messages list works properly."""
+
+    response_json = await send_harmony_request(
+        server,
+        {
+            "model": model_name,
+            "input": "What's 5 + 5?",
+            "instructions": "Answer the math question.",
+            "previous_response_harmony_messages": []  # Empty list
+        })
+
+    response = HarmonyResponse(response_json)
+    assert response.status == "completed"
+    assert len(response.input_harmony_messages) > 0
+    assert len(response.output_harmony_messages) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_harmony_message_none_parameter(client: OpenAI, model_name: str,
+                                              server):
+    """Test that None harmony messages parameter works (same as omitting)."""
+
+    response_json = await send_harmony_request(
+        server, {
+            "model": model_name,
+            "input": "What's 7 + 8?",
+            "instructions": "Answer the math question.",
+            "previous_response_harmony_messages": None
+        })
+
+    response = HarmonyResponse(response_json)
+    assert response.status == "completed"
+    assert len(response.input_harmony_messages) > 0
+    assert len(response.output_harmony_messages) > 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_harmony_message_validation_error(client: OpenAI,
+                                                model_name: str, server):
+    """Test that malformed harmony messages produce validation errors."""
+
+    # Test with invalid harmony message structure
+    invalid_harmony_messages = [{
+        "role": "user",
+        # Missing required "content" field
+        "channel": None,
+        "recipient": None,
+        "content_type": None
+    }]
+
+    async with httpx.AsyncClient(timeout=120.0) as http_client:
+        response = await http_client.post(
+            f"{server.url_root}/v1/responses",
+            json={
+                "model": model_name,
+                "input": "Hello",
+                "previous_response_harmony_messages": invalid_harmony_messages
+            },
+            headers={"Authorization": f"Bearer {server.DUMMY_API_KEY}"})
+
+        # Should get an error (4xx or 5xx status code due to missing content)
+        assert response.status_code >= 400
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_harmony_message_chain_conversation(client: OpenAI,
+                                                  model_name: str, server):
+    """Test chaining multiple requests with harmony messages."""
+
+    # Start a conversation
+    response1_json = await send_harmony_request(
+        server, {
+            "model": model_name,
+            "input": "My favorite color is blue.",
+            "instructions": "Remember this information."
+        })
+
+    response1 = HarmonyResponse(response1_json)
+    assert response1.status == "completed"
+
+    # Continue with context from first response
+    messages_after_1 = (response1.input_harmony_messages +
+                        response1.output_harmony_messages)
+
+    response2_json = await send_harmony_request(
+        server, {
+            "model": model_name,
+            "input": "What's my favorite color?",
+            "instructions": "Use the previous context.",
+            "previous_response_harmony_messages": messages_after_1
+        })
+
+    response2 = HarmonyResponse(response2_json)
+    assert response2.status == "completed"
+
+    # Continue with context from second response
+    messages_after_2 = (response2.input_harmony_messages +
+                        response2.output_harmony_messages)
+
+    response3_json = await send_harmony_request(
+        server, {
+            "model": model_name,
+            "input": "What about my favorite number? It's 42.",
+            "instructions": "Remember this new information too.",
+            "previous_response_harmony_messages": messages_after_2
+        })
+
+    response3 = HarmonyResponse(response3_json)
+    assert response3.status == "completed"
+
+    # Final request should have context from all previous messages
+    messages_after_3 = (response3.input_harmony_messages +
+                        response3.output_harmony_messages)
+
+    response4_json = await send_harmony_request(
+        server, {
+            "model": model_name,
+            "input": "What are my favorite color and number?",
+            "instructions": "Recall both pieces of information.",
+            "previous_response_harmony_messages": messages_after_3
+        })
+
+    response4 = HarmonyResponse(response4_json)
+    assert response4.status == "completed"
+
+    # Verify the conversation has grown with each interaction
+    assert len(response4.input_harmony_messages) > len(
+        response3.input_harmony_messages)
+    assert len(response3.input_harmony_messages) > len(
+        response2.input_harmony_messages)
+    assert len(response2.input_harmony_messages) > len(
+        response1.input_harmony_messages)
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 9d587e866933..088807f123d3 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -75,6 +75,7 @@ def __init__(
         self,
         available_tools: list[str],
     ):
         self._messages = messages
+        self.input_messages = messages.copy()
         self.available_tools = available_tools
         self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 488102232562..fd38cf18f253 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -33,6 +33,7 @@
 from openai.types.responses.response import ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning
+from openai_harmony import Message
 from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
                       ValidationInfo, field_validator, model_validator)
 from typing_extensions import TypeAlias
@@ -274,6 +275,9 @@ class ResponsesRequest(OpenAIBaseModel):
     model: Optional[str] = None
     parallel_tool_calls: Optional[bool] = True
     previous_response_id: Optional[str] = None
+    # This can be used when the store is disabled but you want to
+    # be able to continue a Responses API thread
+    previous_response_harmony_messages: Optional[list[Message]] = None
     prompt: Optional[ResponsePrompt] = None
     reasoning: Optional[Reasoning] = None
     service_tier: Literal["auto", "default", "flex", "scale",
@@ -374,6 +378,27 @@ def is_include_output_logprobs(self) -> bool:
             self.include,
             list) and "message.output_text.logprobs" in self.include
 
+    @field_validator("previous_response_harmony_messages", mode="before")
+    @classmethod
+    def deserialize_harmony_messages(cls, v):
+        """Convert incoming JSON dictionaries to Message objects."""
+        if v is None:
+            return v
+        if isinstance(v, list):
+            result = []
+            for item in v:
+                if isinstance(item, dict):
+                    # Convert dictionary to Message object using from_dict
+                    result.append(Message.from_dict(item))
+                elif isinstance(item, Message):
+                    # Already a Message object
+                    result.append(item)
+                else:
+                    raise ValueError(
+                        f"Invalid harmony message type: {type(item)}")
+            return result
+        raise ValueError(f"Invalid type for harmony messages: {type(v)}")
+
     @model_validator(mode="before")
     def validate_background(cls, data):
         if not data.get("background"):
@@ -1847,6 +1872,10 @@ class ResponseUsage(OpenAIBaseModel):
 class ResponsesResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
     created_at: int = Field(default_factory=lambda: int(time.time()))
+    # These are populated when the env flag
+    # VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT is set
+    input_harmony_messages: Optional[list[dict[str, Any]]] = None
+    output_harmony_messages: Optional[list[dict[str, Any]]] = None
     # error: Optional[ResponseError] = None
     # incomplete_details: Optional[IncompleteDetails] = None
     instructions: Optional[str] = None
@@ -1882,12 +1911,20 @@ def from_request(
         created_time: int,
         output: list[ResponseOutputItem],
         status: ResponseStatus,
+        input_harmony_messages: Optional[list[Message]] = None,
+        output_harmony_messages: Optional[list[Message]] = None,
         usage: Optional[ResponseUsage] = None,
     ) -> "ResponsesResponse":
         return cls(
             id=request.request_id,
             created_at=created_time,
             instructions=request.instructions,
+            input_harmony_messages=[
+                msg.to_dict() for msg in input_harmony_messages
+            ] if input_harmony_messages else None,
+            output_harmony_messages=[
+                msg.to_dict() for msg in output_harmony_messages
+            ] if output_harmony_messages else None,
             metadata=request.metadata,
             model=model_name,
             output=output,
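Editor's note on the two hunks above: on the way in, `deserialize_harmony_messages` turns plain JSON dictionaries into `openai_harmony.Message` objects, and on the way out `ResponsesResponse.from_request` serializes them back with `to_dict()`. A rough illustration of that round trip (a sketch, assuming the `openai_harmony` package is installed; the dict shape mirrors what the tests send):

```python
from openai_harmony import Message

payload = {
    "role": "user",
    "content": [{"type": "text", "text": "What is the capital of France?"}],
    "channel": None,
    "recipient": None,
    "content_type": None,
}

# What the field_validator does for each incoming list item.
msg = Message.from_dict(payload)

# What from_request does when building the new response fields.
roundtripped = msg.to_dict()
assert roundtripped["role"] == "user"  # the tests assert "role" is preserved
```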
""" diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 4c15de303099..1defbe6f9b87 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -439,9 +439,17 @@ async def responses_full_generator( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) + output_harmony_messages = None + input_harmony_messages = None if self.use_harmony: assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) + if envs.VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT: + # TODO: Handle leftover parser state? + input_harmony_messages = context.input_messages + # .messages contains input and output, so just get the output + output_harmony_messages = context.messages[ + len(input_harmony_messages):] # TODO: these are all 0 for now! num_prompt_tokens = context.num_prompt_tokens num_generated_tokens = context.num_output_tokens @@ -479,6 +487,8 @@ async def responses_full_generator( model_name=model_name, created_time=created_time, output=output, + input_harmony_messages=input_harmony_messages, + output_harmony_messages=output_harmony_messages, status="completed", usage=usage, ) @@ -666,31 +676,7 @@ def _construct_input_messages_with_harmony( prev_response: Optional[ResponsesResponse], ) -> list[OpenAIHarmonyMessage]: messages: list[OpenAIHarmonyMessage] = [] - if prev_response is None: - # New conversation. - reasoning_effort = (request.reasoning.effort - if request.reasoning else None) - tool_types = [tool.type for tool in request.tools] - enable_browser = ("web_search_preview" in tool_types - and self.tool_server is not None - and self.tool_server.has_tool("browser")) - enable_code_interpreter = ("code_interpreter" in tool_types - and self.tool_server is not None - and self.tool_server.has_tool("python")) - sys_msg = get_system_message( - reasoning_effort=reasoning_effort, - browser_description=self.tool_server.get_tool_description( - "browser") - if enable_browser and self.tool_server is not None else None, - python_description=self.tool_server.get_tool_description( - "python") if enable_code_interpreter - and self.tool_server is not None else None, - ) - messages.append(sys_msg) - dev_msg = get_developer_message(request.instructions, - request.tools) - messages.append(dev_msg) - else: + if prev_response is not None: # Continue the previous conversation. # FIXME(woosuk): Currently, request params like reasoning and # instructions are ignored. @@ -716,6 +702,32 @@ def _construct_input_messages_with_harmony( if msg.channel != "analysis": prev_msgs.append(msg) messages.extend(prev_msgs) + elif request.previous_response_harmony_messages is not None: + messages.extend(request.previous_response_harmony_messages) + else: + # New conversation. 
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 4c15de303099..1defbe6f9b87 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -439,9 +439,17 @@ async def responses_full_generator(
             # TODO: Use a vllm-specific Validation Error
             return self.create_error_response(str(e))
 
+        output_harmony_messages = None
+        input_harmony_messages = None
         if self.use_harmony:
             assert isinstance(context, HarmonyContext)
             output = self._make_response_output_items_with_harmony(context)
+            if envs.VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT:
+                # TODO: Handle leftover parser state?
+                input_harmony_messages = context.input_messages
+                # .messages contains input and output, so just get the output
+                output_harmony_messages = context.messages[
+                    len(input_harmony_messages):]
             # TODO: these are all 0 for now!
             num_prompt_tokens = context.num_prompt_tokens
             num_generated_tokens = context.num_output_tokens
@@ -479,6 +487,8 @@
             model_name=model_name,
             created_time=created_time,
             output=output,
+            input_harmony_messages=input_harmony_messages,
+            output_harmony_messages=output_harmony_messages,
             status="completed",
             usage=usage,
         )
@@ -666,31 +676,7 @@ def _construct_input_messages_with_harmony(
         prev_response: Optional[ResponsesResponse],
     ) -> list[OpenAIHarmonyMessage]:
         messages: list[OpenAIHarmonyMessage] = []
-        if prev_response is None:
-            # New conversation.
-            reasoning_effort = (request.reasoning.effort
-                                if request.reasoning else None)
-            tool_types = [tool.type for tool in request.tools]
-            enable_browser = ("web_search_preview" in tool_types
-                              and self.tool_server is not None
-                              and self.tool_server.has_tool("browser"))
-            enable_code_interpreter = ("code_interpreter" in tool_types
-                                       and self.tool_server is not None
-                                       and self.tool_server.has_tool("python"))
-            sys_msg = get_system_message(
-                reasoning_effort=reasoning_effort,
-                browser_description=self.tool_server.get_tool_description(
-                    "browser")
-                if enable_browser and self.tool_server is not None else None,
-                python_description=self.tool_server.get_tool_description(
-                    "python") if enable_code_interpreter
-                and self.tool_server is not None else None,
-            )
-            messages.append(sys_msg)
-            dev_msg = get_developer_message(request.instructions,
-                                            request.tools)
-            messages.append(dev_msg)
-        else:
+        if prev_response is not None:
             # Continue the previous conversation.
             # FIXME(woosuk): Currently, request params like reasoning and
             # instructions are ignored.
@@ -716,6 +702,32 @@
                 if msg.channel != "analysis":
                     prev_msgs.append(msg)
             messages.extend(prev_msgs)
+        elif request.previous_response_harmony_messages is not None:
+            messages.extend(request.previous_response_harmony_messages)
+        else:
+            # New conversation.
+            reasoning_effort = (request.reasoning.effort
+                                if request.reasoning else None)
+            tool_types = [tool.type for tool in request.tools]
+            enable_browser = ("web_search_preview" in tool_types
+                              and self.tool_server is not None
+                              and self.tool_server.has_tool("browser"))
+            enable_code_interpreter = ("code_interpreter" in tool_types
+                                       and self.tool_server is not None
+                                       and self.tool_server.has_tool("python"))
+            sys_msg = get_system_message(
+                reasoning_effort=reasoning_effort,
+                browser_description=self.tool_server.get_tool_description(
+                    "browser")
+                if enable_browser and self.tool_server is not None else None,
+                python_description=self.tool_server.get_tool_description(
+                    "python") if enable_code_interpreter
+                and self.tool_server is not None else None,
+            )
+            messages.append(sys_msg)
+            dev_msg = get_developer_message(request.instructions,
+                                            request.tools)
+            messages.append(dev_msg)
         # Append the new input.
         # Reponses API supports simple text inputs without chat format.
         if isinstance(request.input, str):
diff --git a/vllm/envs.py b/vllm/envs.py
index 1232bd7bf963..85dbd3b89f47 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -168,6 +168,7 @@
     VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False
     VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None
     VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False
+    VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT: bool = False
 
 
 def get_default_cache_root():
@@ -1199,6 +1200,12 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_TUNED_CONFIG_FOLDER":
     lambda: os.getenv("VLLM_TUNED_CONFIG_FOLDER", None),
 
+    # Whether to enable outputting Harmony messages on the
+    # Responses API response object
+    "VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT":
+    lambda: bool(int(
+        os.getenv("VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT",
+                  "0"))),
 }
 
 # --8<-- [end:env-vars-definition]
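Editor's note on the new flag: the lambda registered in vllm/envs.py parses the variable with `bool(int(...))`, so it expects an integer literal such as "0" or "1"; a value like "true" would raise `ValueError` when the flag is read. A small standalone check of that behaviour (a sketch that mirrors the lambda rather than importing vLLM):

```python
import os


def parse_flag(value: str) -> bool:
    # Mirrors the lambda added for VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT.
    os.environ["VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT"] = value
    return bool(int(
        os.getenv("VLLM_RESPONSES_API_ENABLE_HARMONY_MESSAGES_OUTPUT", "0")))


assert parse_flag("1") is True
assert parse_flag("0") is False
# parse_flag("true") would raise ValueError, since int("true") is not valid.
```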