4 | 4 | import asyncio
5 | 5 | import time
6 | 6 | from collections.abc import AsyncGenerator, AsyncIterator
| 7 | +from copy import copy
7 | 8 | from http import HTTPStatus
8 | 9 | from typing import Callable, Final, Optional, Union
9 | 10 |
10 | 11 | import jinja2
11 | 12 | from fastapi import Request
12 | | -from openai.types.responses import ResponseOutputMessage, ResponseOutputText
| 13 | +from openai.types.responses import (ResponseFunctionToolCall,
| 14 | +                                    ResponseOutputMessage, ResponseOutputText)
| 15 | +from openai_harmony import Message as OpenAIHarmonyMessage
13 | 16 |
14 | 17 | from vllm import envs
15 | 18 | from vllm.config import ModelConfig
16 | 19 | from vllm.engine.protocol import EngineClient
17 | 20 | from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam,
18 | 21 |                                          ChatTemplateContentFormatOption)
19 | 22 | from vllm.entrypoints.context import ConversationContext, SimpleContext
| 23 | +from vllm.entrypoints.harmony_utils import (
| 24 | +    get_developer_message, get_stop_tokens_for_assistant_actions,
| 25 | +    get_system_message, get_user_message, parse_response_input,
| 26 | +    render_for_completion)
20 | 27 | from vllm.entrypoints.logger import RequestLogger
21 | 28 | # yapf conflicts with isort for this block
22 | 29 | # yapf: disable

30 | 37 | from vllm.entrypoints.openai.serving_engine import OpenAIServing
31 | 38 | from vllm.entrypoints.openai.serving_models import OpenAIServingModels
32 | 39 | from vllm.entrypoints.tool_server import ToolServer
| 40 | +from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
33 | 41 | from vllm.logger import init_logger
34 | 42 | from vllm.reasoning import ReasoningParser, ReasoningParserManager
35 | 43 | from vllm.sampling_params import SamplingParams
@@ -103,6 +111,29 @@ def __init__(
103 | 111 |                 "`VLLM_ENABLE_RESPONSES_API_STORE` is enabled. This may "
104 | 112 |                 "cause a memory leak since we never remove responses from "
105 | 113 |                 "the store.")
| 114 | +
| 115 | +        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
| 116 | +        if self.use_harmony:
| 117 | +            logger.warning("For gpt-oss, we ignore --enable-auto-tool-choice "
| 118 | +                           "and always enable tool use.")
| 119 | +            # gpt-oss models have two EOS-like tokens: <|return|> and <|call|>.
| 120 | +            # We need to add them to the stop token ids.
| 121 | +            if "stop_token_ids" not in self.default_sampling_params:
| 122 | +                self.default_sampling_params["stop_token_ids"] = []
| 123 | +            self.default_sampling_params["stop_token_ids"].extend(
| 124 | +                get_stop_tokens_for_assistant_actions())
| 125 | +
| 126 | +        # Set up tool use.
| 127 | +        self.enable_auto_tools: bool = enable_auto_tools
| 128 | +        if self.enable_auto_tools:
| 129 | +            logger.info(
| 130 | +                "\"auto\" tool choice has been enabled. Please note that "
| 131 | +                "while the parallel_tool_calls client option is accepted "
| 132 | +                "for compatibility reasons, it will be ignored.")
| 133 | +            if not self.use_harmony:
| 134 | +                raise NotImplementedError("Auto tool choice is not supported "
| 135 | +                                          "yet unless using Harmony")
| 136 | +
106 | 137 |         # HACK(woosuk): This is a hack. We should use a better store.
107 | 138 |         # FIXME: If enable_store=True, this may cause a memory leak since we
108 | 139 |         # never remove responses from the store.
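A note on the stop-token wiring in this hunk: `get_stop_tokens_for_assistant_actions()` presumably returns the token ids of Harmony's `<|return|>` and `<|call|>` markers, so decoding stops both when the model finishes a final answer and when it emits a tool call. Below is a minimal standalone sketch of how those ids are folded into the default sampling params; the helper name and the token-id values are placeholders, not the real ids.

```python
# Minimal sketch of how Harmony stop tokens are merged into the default
# sampling params. The token ids are placeholders for <|return|> / <|call|>.
def add_harmony_stop_tokens(default_sampling_params: dict,
                            harmony_stop_token_ids: list[int]) -> None:
    # Mirrors the diff: create the list if absent, then extend it so any
    # user-configured stop tokens are kept alongside the Harmony ones.
    if "stop_token_ids" not in default_sampling_params:
        default_sampling_params["stop_token_ids"] = []
    default_sampling_params["stop_token_ids"].extend(harmony_stop_token_ids)


params = {"temperature": 1.0}
add_harmony_stop_tokens(params, [111111, 222222])  # placeholder ids
assert params["stop_token_ids"] == [111111, 222222]
```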
@@ -165,21 +196,20 @@ async def create_responses(
165 | 196 |                 return self._make_not_found_error(prev_response_id)
166 | 197 |         else:
167 | 198 |             prev_response = None
168 | | -        # Construct the input messages.
169 | | -        messages = self._construct_input_messages(request, prev_response)
170 | 199 |
171 | 200 |         try:
172 | 201 |             lora_request = self._maybe_get_adapters(request)
173 | 202 |             model_name = self._get_model_name(request.model, lora_request)
174 | 203 |             tokenizer = await self.engine_client.get_tokenizer(lora_request)
175 | 204 |
176 | | -            _, request_prompts, engine_prompts = await self._preprocess_chat(
177 | | -                request,
178 | | -                tokenizer,
179 | | -                messages,
180 | | -                chat_template=self.chat_template,
181 | | -                chat_template_content_format=self.chat_template_content_format,
182 | | -            )
| 205 | +            if self.use_harmony:
| 206 | +                messages, request_prompts, engine_prompts = (
| 207 | +                    self._make_request_with_harmony(request, prev_response))
| 208 | +            else:
| 209 | +                messages, request_prompts, engine_prompts = (
| 210 | +                    await self._make_request(request, prev_response,
| 211 | +                                             tokenizer))
| 212 | +
183 | 213 |         except (ValueError, TypeError, RuntimeError,
184 | 214 |                 jinja2.TemplateError) as e:
185 | 215 |             logger.exception("Error in preprocessing prompt inputs")
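For reference, this branch only changes how the prompt is built; both paths return the same `(messages, request_prompts, engine_prompts)` triple that feeds the existing generation code. A hedged client-side example of the Responses API flow this endpoint serves, including the stored-conversation case handled via `prev_response` above, is sketched below; the base URL, API key, and model name are illustrative assumptions that depend on how the server was launched.

```python
# Illustrative client usage of the Responses API against a local vLLM server.
# base_url, api_key, and the model name are assumptions for this sketch.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

first = client.responses.create(
    model="openai/gpt-oss-20b",
    input="What is the capital of France?",
)

# previous_response_id continues a stored conversation; the server resolves it
# through the prev_response lookup shown in this hunk.
follow_up = client.responses.create(
    model="openai/gpt-oss-20b",
    input="Roughly how many people live there?",
    previous_response_id=first.id,
)
print(follow_up.output_text)
```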
@@ -275,6 +305,38 @@ async def create_responses(
275 | 305 |         except Exception as e:
276 | 306 |             return self.create_error_response(str(e))
277 | 307 |
| 308 | +    async def _make_request(
| 309 | +        self,
| 310 | +        request: ResponsesRequest,
| 311 | +        prev_response: Optional[ResponsesResponse],
| 312 | +        tokenizer: AnyTokenizer,
| 313 | +    ):
| 314 | +        # Construct the input messages.
| 315 | +        messages = self._construct_input_messages(request, prev_response)
| 316 | +        _, request_prompts, engine_prompts = await self._preprocess_chat(
| 317 | +            request,
| 318 | +            tokenizer,
| 319 | +            messages,
| 320 | +            chat_template=self.chat_template,
| 321 | +            chat_template_content_format=self.chat_template_content_format,
| 322 | +        )
| 323 | +        return messages, request_prompts, engine_prompts
| 324 | +
| 325 | +    def _make_request_with_harmony(
| 326 | +        self,
| 327 | +        request: ResponsesRequest,
| 328 | +        prev_response: Optional[ResponsesResponse],
| 329 | +    ):
| 330 | +        if request.tool_choice != "auto":
| 331 | +            raise NotImplementedError(
| 332 | +                "Only 'auto' tool_choice is supported in the "
| 333 | +                "Responses API with Harmony")
| 334 | +        messages = self._construct_input_messages_with_harmony(
| 335 | +            request, prev_response)
| 336 | +        prompt_token_ids = render_for_completion(messages)
| 337 | +        engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
| 338 | +        return messages, [prompt_token_ids], [engine_prompt]
| 339 | +
278 | 340 |     async def responses_full_generator(
279 | 341 |         self,
280 | 342 |         request: ResponsesRequest,
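On the Harmony path above, `render_for_completion` turns the Harmony message list into prompt token ids that are handed to the engine as a pre-tokenized `TokensPrompt`, so no chat template is involved. The helper lives in `vllm.entrypoints.harmony_utils`; the sketch below is an approximation of that rendering step using the public `openai_harmony` API, not the actual implementation.

```python
# Approximate sketch of rendering Harmony messages into prompt token ids.
# Assumes the openai_harmony package; this is not the vLLM helper itself.
from openai_harmony import (Conversation, HarmonyEncodingName, Message, Role,
                            load_harmony_encoding)

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)


def render_messages_for_completion(messages: list[Message]) -> list[int]:
    # Render the whole conversation and leave the assistant turn open so the
    # model generates the next message.
    conversation = Conversation.from_messages(messages)
    return encoding.render_conversation_for_completion(
        conversation, Role.ASSISTANT)


prompt_token_ids = render_messages_for_completion(
    [Message.from_role_and_content(Role.USER, "Hello!")])
```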
@@ -411,6 +473,82 @@ def _construct_input_messages(
411 | 473 |             messages.extend(request.input)  # type: ignore
412 | 474 |         return messages
413 | 475 |
| 476 | +    def _construct_input_messages_with_harmony(
| 477 | +        self,
| 478 | +        request: ResponsesRequest,
| 479 | +        prev_response: Optional[ResponsesResponse],
| 480 | +    ) -> list[OpenAIHarmonyMessage]:
| 481 | +        messages: list[OpenAIHarmonyMessage] = []
| 482 | +        if prev_response is None:
| 483 | +            # New conversation.
| 484 | +            reasoning_effort = (request.reasoning.effort
| 485 | +                                if request.reasoning else None)
| 486 | +            tool_types = [tool.type for tool in request.tools]
| 487 | +            enable_browser = ("web_search_preview" in tool_types
| 488 | +                              and self.tool_server is not None
| 489 | +                              and self.tool_server.has_tool("browser"))
| 490 | +            enable_code_interpreter = ("code_interpreter" in tool_types
| 491 | +                                       and self.tool_server is not None
| 492 | +                                       and self.tool_server.has_tool("python"))
| 493 | +            sys_msg = get_system_message(
| 494 | +                reasoning_effort=reasoning_effort,
| 495 | +                browser_description=self.tool_server.get_tool_description(
| 496 | +                    "browser")
| 497 | +                if enable_browser and self.tool_server is not None else None,
| 498 | +                python_description=self.tool_server.get_tool_description(
| 499 | +                    "python") if enable_code_interpreter
| 500 | +                and self.tool_server is not None else None,
| 501 | +            )
| 502 | +            messages.append(sys_msg)
| 503 | +            dev_msg = get_developer_message(request.instructions,
| 504 | +                                            request.tools)
| 505 | +            messages.append(dev_msg)
| 506 | +        else:
| 507 | +            # Continue the previous conversation.
| 508 | +            # FIXME(woosuk): Currently, request params like reasoning and
| 509 | +            # instructions are ignored.
| 510 | +            prev_msgs = self.msg_store[prev_response.id]
| 511 | +            # Remove the previous chain-of-thought (analysis) messages if
| 512 | +            # there is a new "final" message. Note that this also removes
| 513 | +            # them from the msg_store.
| 514 | +            if len(prev_msgs) > 0:
| 515 | +                last_msg = prev_msgs[-1]
| 516 | +                assert isinstance(last_msg, OpenAIHarmonyMessage)
| 517 | +                if last_msg.channel == "final":
| 518 | +                    prev_final_msg_idx = -1
| 519 | +                    for i in range(len(prev_msgs) - 2, -1, -1):
| 520 | +                        prev_msg_i = prev_msgs[i]
| 521 | +                        assert isinstance(prev_msg_i, OpenAIHarmonyMessage)
| 522 | +                        if prev_msg_i.channel == "final":
| 523 | +                            prev_final_msg_idx = i
| 524 | +                            break
| 525 | +                    recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1:]
| 526 | +                    del prev_msgs[prev_final_msg_idx + 1:]
| 527 | +                    for msg in recent_turn_msgs:
| 528 | +                        assert isinstance(msg, OpenAIHarmonyMessage)
| 529 | +                        if msg.channel != "analysis":
| 530 | +                            prev_msgs.append(msg)
| 531 | +            messages.extend(prev_msgs)
| 532 | +        # Append the new input.
| 533 | +        # The Responses API supports simple text inputs without chat format.
| 534 | +        if isinstance(request.input, str):
| 535 | +            messages.append(get_user_message(request.input))
| 536 | +        else:
| 537 | +            if prev_response is not None:
| 538 | +                prev_outputs = copy(prev_response.output)
| 539 | +            else:
| 540 | +                prev_outputs = []
| 541 | +            for response_msg in request.input:
| 542 | +                messages.append(
| 543 | +                    parse_response_input(response_msg, prev_outputs))
| 544 | +                # The user passes in a tool call request and its output. We
| 545 | +                # need to add the tool call request to prev_outputs so that
| 546 | +                # parse_response_input can find the tool call request when
| 547 | +                # parsing the tool call output.
| 548 | +                if isinstance(response_msg, ResponseFunctionToolCall):
| 549 | +                    prev_outputs.append(response_msg)
| 550 | +        return messages
| 551 | +
414 | 552 |     async def _run_background_request(
415 | 553 |         self,
416 | 554 |         request: ResponsesRequest,
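The trickiest part of `_construct_input_messages_with_harmony` above is the chain-of-thought pruning: when a stored conversation ends with a message on the "final" channel, the "analysis" (reasoning) messages from that most recent turn are dropped before the new input is appended, while earlier turns and non-analysis messages are kept. A standalone sketch of that rule, modelling messages as `(channel, text)` tuples instead of Harmony `Message` objects:

```python
# Standalone illustration of the chain-of-thought pruning rule in the hunk
# above. Messages are modelled as (channel, text) tuples rather than Harmony
# Message objects; the logic mirrors the diff, not the exact vLLM code.
def prune_last_turn_analysis(
        prev_msgs: list[tuple[str, str]]) -> list[tuple[str, str]]:
    if not prev_msgs or prev_msgs[-1][0] != "final":
        return prev_msgs
    # Find the "final" message that closed the turn before the last one.
    prev_final_idx = -1
    for i in range(len(prev_msgs) - 2, -1, -1):
        if prev_msgs[i][0] == "final":
            prev_final_idx = i
            break
    # Keep everything up to (and including) that earlier "final" message,
    # then re-append only the non-"analysis" messages of the latest turn.
    kept = prev_msgs[:prev_final_idx + 1]
    kept.extend(m for m in prev_msgs[prev_final_idx + 1:]
                if m[0] != "analysis")
    return kept


history = [
    ("final", "answer to turn 1"),
    ("analysis", "reasoning for turn 2"),
    ("commentary", "tool call made during turn 2"),
    ("final", "answer to turn 2"),
]
# The turn-2 "analysis" message is dropped; everything else is preserved.
assert prune_last_turn_analysis(history) == [
    ("final", "answer to turn 1"),
    ("commentary", "tool call made during turn 2"),
    ("final", "answer to turn 2"),
]
```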