 import partial_json_parser
 import regex as re
 from fastapi import Request
+from openai_harmony import Message as OpenAIMessage
 from pydantic import TypeAdapter

 from vllm.config import ModelConfig
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption,
                                          ConversationMessage,
                                          random_tool_call_id)
+from vllm.entrypoints.harmony_utils import (
+    get_developer_message, get_stop_tokens_for_assistant_actions,
+    get_streamable_parser_for_assistant, get_system_message, parse_chat_input,
+    parse_chat_output, render_for_completion)
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionLogProb, ChatCompletionLogProbs,
 from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import (
     MistralToolCall)
 from vllm.entrypoints.utils import get_max_tokens
+from vllm.inputs.data import TokensPrompt as EngineTokensPrompt
 from vllm.logger import init_logger
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
@@ -125,6 +131,23 @@ def __init__(
         logger.info("Using default chat sampling params from %s: %s",
                     source, self.default_sampling_params)

+        self.use_harmony = model_config.hf_config.model_type == "gpt_oss"
+        if self.use_harmony:
+            if "stop_token_ids" not in self.default_sampling_params:
+                self.default_sampling_params["stop_token_ids"] = []
+            self.default_sampling_params["stop_token_ids"].extend(
+                get_stop_tokens_for_assistant_actions())
+
+        # NOTE(woosuk): While OpenAI's chat completion API supports browsing
+        # for some models, vLLM currently doesn't support it. Please use the
+        # Responses API instead.
+        self.supports_browsing = False
+        self.browser_tool = None
+        # NOTE(woosuk): The Chat Completion API does not support the code
+        # interpreter. Please use the Responses API instead.
+        self.supports_code_interpreter = False
+        self.python_tool = None
+
     async def create_chat_completion(
         self,
         request: ChatCompletionRequest,
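
For reference, `get_stop_tokens_for_assistant_actions()` is defined in `vllm.entrypoints.harmony_utils`, which this diff does not show. A minimal sketch of what it plausibly wraps, assuming it delegates to the `openai_harmony` package:

```python
# Sketch, not the actual vLLM helper: assumes delegation to openai_harmony.
from openai_harmony import HarmonyEncodingName, load_harmony_encoding

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
# Token ids that terminate an assistant action (e.g. <|return|>, <|call|>),
# added above as extra stop tokens so decoding halts at message boundaries.
stop_token_ids = encoding.stop_tokens_for_assistant_actions()
```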
@@ -169,7 +192,8 @@ async def create_chat_completion(

         if (request.tool_choice == "auto" and
                 not (self.enable_auto_tools and tool_parser is not None)
-                and not isinstance(tokenizer, MistralTokenizer)):
+                and not isinstance(tokenizer, MistralTokenizer)
+                and not self.use_harmony):
             # for hf tokenizers, "auto" tools requires
             # --enable-auto-tool-choice and --tool-call-parser
             return self.create_error_response(
@@ -184,25 +208,35 @@ async def create_chat_completion(
             else:
                 tool_dicts = [tool.model_dump() for tool in request.tools]

-            (
-                conversation,
-                request_prompts,
-                engine_prompts,
-            ) = await self._preprocess_chat(
-                request,
-                tokenizer,
-                request.messages,
-                chat_template=request.chat_template or self.chat_template,
-                chat_template_content_format=self.chat_template_content_format,
-                add_generation_prompt=request.add_generation_prompt,
-                continue_final_message=request.continue_final_message,
-                tool_dicts=tool_dicts,
-                documents=request.documents,
-                chat_template_kwargs=request.chat_template_kwargs,
-                tool_parser=tool_parser,
-                truncate_prompt_tokens=request.truncate_prompt_tokens,
-                add_special_tokens=request.add_special_tokens,
-            )
+            if not self.use_harmony:
+                # Common case.
+                (
+                    conversation,
+                    request_prompts,
+                    engine_prompts,
+                ) = await self._preprocess_chat(
+                    request,
+                    tokenizer,
+                    request.messages,
+                    chat_template=request.chat_template or self.chat_template,
+                    chat_template_content_format=self.
+                    chat_template_content_format,
+                    add_generation_prompt=request.add_generation_prompt,
+                    continue_final_message=request.continue_final_message,
+                    tool_dicts=tool_dicts,
+                    documents=request.documents,
+                    chat_template_kwargs=request.chat_template_kwargs,
+                    tool_parser=tool_parser,
+                    truncate_prompt_tokens=request.truncate_prompt_tokens,
+                    add_special_tokens=request.add_special_tokens,
+                )
+            else:
+                # For GPT-OSS.
+                (
+                    conversation,
+                    request_prompts,
+                    engine_prompts,
+                ) = self._make_request_with_harmony(request)
         except (ValueError, TypeError, RuntimeError,
                 jinja2.TemplateError) as e:
             logger.exception("Error in preprocessing prompt inputs")
@@ -436,6 +470,11 @@ async def chat_completion_stream_generator(
         finish_reason_sent = [False] * num_choices
         num_prompt_tokens = 0
         num_cached_tokens = None
+        if self.use_harmony:
+            harmony_parsers = [
+                get_streamable_parser_for_assistant()
+                for _ in range(num_choices)
+            ]

         if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
             tool_choice_function_name = request.tool_choice.function.name
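
A minimal sketch of how these streamable parsers behave, assuming `get_streamable_parser_for_assistant()` wraps `openai_harmony.StreamableParser` (the token ids below are illustrative placeholders; a real stream comes from the engine's outputs):

```python
from openai_harmony import (HarmonyEncodingName, Role, StreamableParser,
                            load_harmony_encoding)

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
parser = StreamableParser(encoding, role=Role.ASSISTANT)

# Illustrative token ids standing in for a generated Harmony completion.
for token_id in [200005, 35644, 200008]:
    parser.process(token_id)
    # current_channel tracks the Harmony channel ("analysis", "commentary",
    # or "final"); last_content_delta holds newly decoded text, if any.
    print(parser.current_channel, repr(parser.last_content_delta))
```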
@@ -597,7 +636,18 @@ async def chat_completion_stream_generator(
                 else:
                     logprobs = None

-                delta_text = output.text
+                if self.use_harmony:
+                    harmony_parser = harmony_parsers[i]
+                    for token_id in output.token_ids:
+                        harmony_parser.process(token_id)
+                    # FIXME(woosuk): Support function calling
+                    is_final = harmony_parser.current_channel == "final"
+                    if not (request.include_reasoning or is_final):
+                        # Skip the reasoning content.
+                        continue
+                    delta_text = harmony_parser.last_content_delta or ""
+                else:
+                    delta_text = output.text

                 if not delta_text and not output.token_ids and \
                     not previous_num_tokens[i]:
@@ -607,7 +657,8 @@ async def chat_completion_stream_generator(
                 delta_message: Optional[DeltaMessage]

                 # just update previous_texts and previous_token_ids
-                if tool_choice_auto or self.reasoning_parser:
+                if ((tool_choice_auto or self.reasoning_parser)
+                        and not self.use_harmony):
                     assert previous_texts is not None
                     assert all_previous_token_ids is not None
                     previous_text = previous_texts[i]
@@ -621,8 +672,14 @@ async def chat_completion_stream_generator(
                 else:
                     current_token_ids = list(output.token_ids)

+                if self.use_harmony:
+                    if is_final:
+                        delta_message = DeltaMessage(content=delta_text)
+                    else:
+                        delta_message = DeltaMessage(
+                            reasoning_content=delta_text)
                 # handle streaming deltas for tools with named tool_choice
-                if tool_choice_function_name:
+                elif tool_choice_function_name:
                     if (self.reasoning_parser and not reasoning_end_arr[i]
                             and not reasoning_parser.is_reasoning_end(
                                 previous_token_ids)):
@@ -990,7 +1047,38 @@ async def chat_completion_full_generator(
                 )
             else:
                 logprobs = None
-            auto_tools_called = False
+
+            if self.use_harmony:
+                reasoning_content, final_content, is_tool_call = (
+                    parse_chat_output(token_ids))
+                if not request.include_reasoning:
+                    reasoning_content = None
+
+                if is_tool_call:
+                    # TODO(woosuk): Implement tool calls for gpt-oss.
+                    # For now, only the Responses API supports tool calls
+                    # for gpt-oss.
+                    raise NotImplementedError(
+                        "Tool calls in the Chat Completion API are not yet "
+                        "supported for gpt-oss. Please use the Responses API.")
+                else:
+                    # Normal message.
+                    message = ChatMessage(
+                        role=role,
+                        reasoning_content=reasoning_content,
+                        content=final_content,
+                    )
+
+                choice_data = ChatCompletionResponseChoice(
+                    index=output.index,
+                    message=message,
+                    logprobs=logprobs,
+                    finish_reason="tool_calls" if is_tool_call else
+                    output.finish_reason if output.finish_reason else "stop",
+                    stop_reason=output.stop_reason,
+                )
+                choices.append(choice_data)
+                continue

             if self.reasoning_parser:
                 try:
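
`parse_chat_output` also lives in `vllm.entrypoints.harmony_utils` and is not shown in this diff. A hypothetical sketch of the channel-splitting it performs, built on `openai_harmony`'s non-streaming parser; the helper name `split_channels` and the attribute access are assumptions, not vLLM's API:

```python
from openai_harmony import HarmonyEncodingName, Role, load_harmony_encoding

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

def split_channels(token_ids):  # hypothetical helper
    """Split a finished completion into (reasoning, final) text."""
    messages = encoding.parse_messages_from_completion_tokens(
        token_ids, role=Role.ASSISTANT)
    reasoning, final = None, None
    for msg in messages:
        text = "".join(part.text for part in msg.content)
        if msg.channel == "analysis":
            reasoning = text  # chain-of-thought channel
        elif msg.channel == "final":
            final = text      # user-visible answer channel
    return reasoning, final
```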
@@ -1003,10 +1091,13 @@ async def chat_completion_full_generator(
                 reasoning_content, content = (
                     reasoning_parser.extract_reasoning_content(
                         output.text, request=request))
+                if not request.include_reasoning:
+                    reasoning_content = None
             else:
                 reasoning_content = None
                 content = output.text

+            auto_tools_called = False
             # if auto tools are not enabled, and a named tool choice using
             # outlines is not being used
             if (not self.enable_auto_tools or not self.tool_parser) and \
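
From the client side, the `include_reasoning` flag handled above is a vLLM extension to the Chat Completions request, so it travels via `extra_body` in the OpenAI SDK. A hypothetical request (server URL and model name are placeholders):

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="openai/gpt-oss-20b",
    messages=[{"role": "user", "content": "What is 2 + 2?"}],
    # vLLM-specific flag: omit reasoning_content from the response.
    extra_body={"include_reasoning": False},
)
print(resp.choices[0].message.content)
```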
@@ -1261,3 +1352,33 @@ def _should_check_for_unstreamed_tool_arg_tokens(
             and delta_message.tool_calls[0].function
             and delta_message.tool_calls[0].function.arguments is not None
         )
+
+    def _make_request_with_harmony(
+        self,
+        request: ChatCompletionRequest,
+    ):
+        messages: list[OpenAIMessage] = []
+
+        # Add the system message.
+        # NOTE: In the Chat Completion API, browsing is enabled by default
+        # if the model supports it. TODO: Support browsing.
+        assert not self.supports_browsing
+        assert not self.supports_code_interpreter
+        sys_msg = get_system_message(
+            reasoning_effort=request.reasoning_effort,
+            browser_description=None,
+            python_description=None)
+        messages.append(sys_msg)
+
+        # Add the developer message.
+        dev_msg = get_developer_message()
+        messages.append(dev_msg)
+
+        # Add the user and conversation messages.
+        for chat_msg in request.messages:
+            messages.append(parse_chat_input(chat_msg))
+
+        # Render the prompt token ids.
+        prompt_token_ids = render_for_completion(messages)
+        engine_prompt = EngineTokensPrompt(prompt_token_ids=prompt_token_ids)
+        return messages, [prompt_token_ids], [engine_prompt]
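
A sketch of the rendering step, assuming `render_for_completion` wraps `openai_harmony`'s conversation renderer:

```python
from openai_harmony import (Conversation, HarmonyEncodingName, Message, Role,
                            load_harmony_encoding)

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
convo = Conversation.from_messages([
    Message.from_role_and_content(Role.USER, "What is 2 + 2?"),
])
# Token ids for the prompt, ending exactly where the assistant's
# completion should begin.
prompt_token_ids = encoding.render_conversation_for_completion(
    convo, Role.ASSISTANT)
```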