@@ -120,6 +120,7 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
             text_after_process = current_req_dict.get("text_after_process")
             if isinstance(prompt_token_ids, np.ndarray):
                 prompt_token_ids = prompt_token_ids.tolist()
+            model_status = current_req_dict.get("model_status")
         except ParameterError as e:
             api_server_logger.error(f"request[{request_id}] generator error: {str(e)}, {e.message}")
             self.engine_client.semaphore.release()
@@ -135,12 +136,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
 
         if request.stream:
             return self.chat_completion_stream_generator(
-                request, request_id, request.model, prompt_token_ids, text_after_process
+                request, request_id, request.model, prompt_token_ids, text_after_process, model_status
             )
         else:
             try:
                 return await self.chat_completion_full_generator(
-                    request, request_id, request.model, prompt_token_ids, text_after_process
+                    request, request_id, request.model, prompt_token_ids, text_after_process, model_status
                 )
             except Exception as e:
                 error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}"
@@ -168,6 +169,7 @@ async def chat_completion_stream_generator(
         model_name: str,
         prompt_token_ids: list(),
         text_after_process: str,
+        model_status: str,
     ):
         """
         Streaming chat completion generator.
@@ -187,10 +189,6 @@ async def chat_completion_stream_generator(
 
         max_streaming_response_tokens = max(1, max_streaming_response_tokens)
 
-        enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
-        if enable_thinking is None:
-            enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
-
         include_stop_str_in_output = request.include_stop_str_in_output
 
         stream_options = request.stream_options
@@ -242,7 +240,7 @@ async def chat_completion_stream_generator(
                 generator = response_processor.process_response_chat(
                     response,
                     stream=True,
-                    enable_thinking=enable_thinking,
+                    model_status=model_status,
                     include_stop_str_in_output=include_stop_str_in_output,
                 )
 
@@ -412,15 +410,13 @@ async def chat_completion_full_generator(
         model_name: str,
         prompt_token_ids: list(),
         text_after_process: str,
+        model_status: str,
     ):
         """
         Full chat completion generator.
         """
         created_time = int(time.time())
         final_res = None
-        enable_thinking = request.chat_template_kwargs.get("enable_thinking") if request.chat_template_kwargs else None
-        if enable_thinking is None:
-            enable_thinking = request.metadata.get("enable_thinking") if request.metadata else None
 
         include_stop_str_in_output = request.include_stop_str_in_output
         try:
@@ -464,7 +460,7 @@ async def chat_completion_full_generator(
             generator = response_processor.process_response_chat(
                 response,
                 stream=False,
-                enable_thinking=enable_thinking,
+                model_status=model_status,
                 include_stop_str_in_output=include_stop_str_in_output,
             )
             async for data in generator:
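Taken together, the hunks replace the per-request enable_thinking lookup (previously read from chat_template_kwargs or metadata) with a model_status value that is resolved once during request preprocessing and threaded through both generator entry points into process_response_chat. A minimal runnable sketch of that threading pattern follows; the stand-in coroutines and the "think_start" status value are hypothetical illustrations, not FastDeploy's actual API.

import asyncio

async def process_response_chat(response, *, stream, model_status, include_stop_str_in_output):
    # Downstream, the processor keys its reasoning-content handling off
    # model_status instead of a caller-supplied enable_thinking flag.
    yield {"response": response, "stream": stream, "model_status": model_status}

async def chat_completion_stream_generator(request, request_id, model_name,
                                           prompt_token_ids, text_after_process,
                                           model_status):
    # model_status is now an explicit parameter, mirroring the diff.
    async for chunk in process_response_chat(
        request, stream=True, model_status=model_status,
        include_stop_str_in_output=False,
    ):
        yield chunk

async def main():
    # In the real handler the value comes from the preprocessed request dict:
    current_req_dict = {"model_status": "think_start"}  # hypothetical status value
    model_status = current_req_dict.get("model_status")
    async for chunk in chat_completion_stream_generator(
        {"messages": []}, "req-1", "demo-model", [1, 2, 3], "", model_status
    ):
        print(chunk)

asyncio.run(main())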