@@ -120,6 +120,7 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
             text_after_process = current_req_dict.get("text_after_process")
             if isinstance(prompt_token_ids, np.ndarray):
                 prompt_token_ids = prompt_token_ids.tolist()
+            model_status = current_req_dict.get("model_status")
         except ParameterError as e:
             api_server_logger.error(f"request[{request_id}] generator error: {str(e)}, {e.message}")
             self.engine_client.semaphore.release()
@@ -135,12 +136,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
 
         if request.stream:
             return self.chat_completion_stream_generator(
-                request, request_id, request.model, prompt_token_ids, text_after_process
+                request, request_id, request.model, prompt_token_ids, text_after_process, model_status
             )
         else:
             try:
                 return await self.chat_completion_full_generator(
-                    request, request_id, request.model, prompt_token_ids, text_after_process
+                    request, request_id, request.model, prompt_token_ids, text_after_process, model_status
                 )
             except Exception as e:
                 error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}"
@@ -168,6 +169,7 @@ async def chat_completion_stream_generator(
         model_name: str,
         prompt_token_ids: list(),
         text_after_process: str,
+        model_status: str,
     ):
         """
         Streaming chat completion generator.
@@ -187,10 +189,6 @@ async def chat_completion_stream_generator(
 
         max_streaming_response_tokens = max(1, max_streaming_response_tokens)
 
-        enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
-        if enable_thinking is None:
-            enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
-
         include_stop_str_in_output = request.include_stop_str_in_output
 
         stream_options = request.stream_options
@@ -242,7 +240,7 @@ async def chat_completion_stream_generator(
                 generator = response_processor.process_response_chat(
                     response,
                     stream=True,
-                    enable_thinking=enable_thinking,
+                    model_status=model_status,
                     include_stop_str_in_output=include_stop_str_in_output,
                 )
 
@@ -412,15 +410,13 @@ async def chat_completion_full_generator(
         model_name: str,
         prompt_token_ids: list(),
         text_after_process: str,
+        model_status: str,
     ):
         """
         Full chat completion generator.
         """
         created_time = int(time.time())
         final_res = None
-        enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
-        if enable_thinking is None:
-            enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
 
         include_stop_str_in_output = request.include_stop_str_in_output
         try:
@@ -464,7 +460,7 @@ async def chat_completion_full_generator(
             generator = response_processor.process_response_chat(
                 response,
                 stream=False,
-                enable_thinking=enable_thinking,
+                model_status=model_status,
                 include_stop_str_in_output=include_stop_str_in_output,
             )
             async for data in generator:
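Taken together, the hunks replace the per-request enable_thinking lookup (previously read from chat_template_kwargs or metadata) with a model_status value that is resolved once during request preprocessing and threaded through both generator entry points into process_response_chat. A minimal runnable sketch of that threading pattern follows; the stand-in coroutines and the "think_start" status value are hypothetical illustrations, not FastDeploy's actual API.

import asyncio

async def process_response_chat(response, *, stream, model_status, include_stop_str_in_output):
    # Downstream, the processor keys its reasoning-content handling off
    # model_status instead of a caller-supplied enable_thinking flag.
    yield {"response": response, "stream": stream, "model_status": model_status}

async def chat_completion_stream_generator(request, request_id, model_name,
                                           prompt_token_ids, text_after_process,
                                           model_status):
    # model_status is now an explicit parameter, mirroring the diff.
    async for chunk in process_response_chat(
        request, stream=True, model_status=model_status,
        include_stop_str_in_output=False,
    ):
        yield chunk

async def main():
    # In the real handler the value comes from the preprocessed request dict:
    current_req_dict = {"model_status": "think_start"}  # hypothetical status value
    model_status = current_req_dict.get("model_status")
    async for chunk in chat_completion_stream_generator(
        {"messages": []}, "req-1", "demo-model", [1, 2, 3], "", model_status
    ):
        print(chunk)

asyncio.run(main())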