Commit 8bbe39d

add model_status
1 parent 671a4dc commit 8bbe39d

File tree: 2 files changed (+12, −16 lines)

  fastdeploy/entrypoints/openai/response_processors.py
  fastdeploy/entrypoints/openai/serving_chat.py

fastdeploy/entrypoints/openai/response_processors.py

Lines changed: 5 additions & 5 deletions
@@ -67,13 +67,13 @@ def accumulate_token_ids(self, request_output):
         else:
             self._multipart_buffer.append({"decode_type": decode_type, "request_output": request_output})
 
-    async def process_response_chat(self, request_outputs, stream, enable_thinking, include_stop_str_in_output):
+    async def process_response_chat(self, request_outputs, stream, model_status, include_stop_str_in_output):
         """
         Process a list of responses into a generator that yields each processed response as it's generated.
         Args:
             request_outputs: The list of outputs to be processed.
             stream: Whether or not to stream the output.
-            enable_thinking: Whether or not to show thinking messages.
+            model_status: Whether or not to show thinking messages.
             include_stop_str_in_output: Whether or not to include stop strings in the output.
         """
         for request_output in request_outputs:
@@ -82,7 +82,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
                 yield self.data_processor.process_response_dict(
                     response_dict=request_output,
                     stream=stream,
-                    enable_thinking=enable_thinking,
+                    model_status=model_status,
                     include_stop_str_in_output=include_stop_str_in_output,
                 )
             elif stream:
@@ -108,7 +108,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
                 self.data_processor.process_response_dict(
                     response_dict=request_output,
                     stream=stream,
-                    enable_thinking=enable_thinking,
+                    model_status=model_status,
                     include_stop_str_in_output=include_stop_str_in_output,
                 )
                 text = {"type": "text", "text": request_output["outputs"]["text"]}
@@ -128,7 +128,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
                     self.data_processor.process_response_dict(
                         response_dict=part["request_output"],
                         stream=False,
-                        enable_thinking=enable_thinking,
+                        model_status=model_status,
                         include_stop_str_in_output=include_stop_str_in_output,
                     )
                     text = {"type": "text", "text": part["request_output"]["outputs"]["text"]}

fastdeploy/entrypoints/openai/serving_chat.py

Lines changed: 7 additions & 11 deletions
@@ -120,6 +120,7 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
             text_after_process = current_req_dict.get("text_after_process")
             if isinstance(prompt_token_ids, np.ndarray):
                 prompt_token_ids = prompt_token_ids.tolist()
+            model_status = current_req_dict.get("model_status")
         except ParameterError as e:
             api_server_logger.error(f"request[{request_id}] generator error: {str(e)}, {e.message}")
             self.engine_client.semaphore.release()
@@ -135,12 +136,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
 
         if request.stream:
             return self.chat_completion_stream_generator(
-                request, request_id, request.model, prompt_token_ids, text_after_process
+                request, request_id, request.model, prompt_token_ids, text_after_process, model_status
             )
         else:
             try:
                 return await self.chat_completion_full_generator(
-                    request, request_id, request.model, prompt_token_ids, text_after_process
+                    request, request_id, request.model, prompt_token_ids, text_after_process, model_status
                 )
             except Exception as e:
                 error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}"
@@ -168,6 +169,7 @@ async def chat_completion_stream_generator(
         model_name: str,
         prompt_token_ids: list(),
         text_after_process: str,
+        model_status: str,
     ):
         """
         Streaming chat completion generator.
@@ -187,10 +189,6 @@ async def chat_completion_stream_generator(
 
         max_streaming_response_tokens = max(1, max_streaming_response_tokens)
 
-        enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
-        if enable_thinking is None:
-            enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
-
         include_stop_str_in_output = request.include_stop_str_in_output
 
         stream_options = request.stream_options
@@ -242,7 +240,7 @@
             generator = response_processor.process_response_chat(
                 response,
                 stream=True,
-                enable_thinking=enable_thinking,
+                model_status=model_status,
                 include_stop_str_in_output=include_stop_str_in_output,
             )
 
@@ -412,15 +410,13 @@ async def chat_completion_full_generator(
         model_name: str,
         prompt_token_ids: list(),
         text_after_process: str,
+        model_status: str,
     ):
         """
         Full chat completion generator.
         """
         created_time = int(time.time())
         final_res = None
-        enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
-        if enable_thinking is None:
-            enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
 
         include_stop_str_in_output = request.include_stop_str_in_output
         try:
@@ -464,7 +460,7 @@ async def chat_completion_full_generator(
             generator = response_processor.process_response_chat(
                 response,
                 stream=False,
-                enable_thinking=enable_thinking,
+                model_status=model_status,
                 include_stop_str_in_output=include_stop_str_in_output,
             )
             async for data in generator:
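
Net effect in serving_chat.py: the per-request lookup of enable_thinking from chat_template_kwargs or metadata is gone, and both generators instead receive the model_status value read from the preprocessed request dict. Side by side, reconstructed only from the deleted and added lines above:

    # Before this commit (deleted lines): resolve the flag from request fields.
    enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
    if enable_thinking is None:
        enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None

    # After this commit (added lines): read the value produced during request
    # preprocessing and pass it through to process_response_chat as model_status.
    model_status = current_req_dict.get("model_status")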
