
Commit e49676c ("fix")
1 parent: dae8419
File tree: 6 files changed (+67, -40 lines)


fastdeploy/engine/request.py

Lines changed: 2 additions & 1 deletion

@@ -71,7 +71,8 @@ def __init__(
         guided_grammar: Optional[Any] = None,
         structural_tag: Optional[Any] = None,
         guided_json_object: Optional[bool] = None,
-        enable_thinking: Optional[bool] = True,
+        enable_thinking: Optional[bool] = False,
+        model_status: Optional[str] = None,
         trace_carrier: dict = dict(),
         dp_rank: Optional[int] = None,
         chat_template: Optional[str] = None,
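
A minimal sketch of what the two touched parameters mean after this hunk (a hypothetical stand-in, not the real fastdeploy.engine.request.Request, which takes many more arguments): thinking is now opt-in by default, and the request can carry a parser-detected model_status.

# Hypothetical, simplified stand-in for the two fields this hunk touches.
from dataclasses import dataclass
from typing import Optional


@dataclass
class RequestSketch:
    enable_thinking: Optional[bool] = False  # default flipped from True to False
    model_status: Optional[str] = None       # new field introduced by this commit


req = RequestSketch()
assert req.enable_thinking is False and req.model_status is None

# An input processor would flip the flag only when the prompt is still inside
# an open thinking span (status "think_start").
req.model_status = "think_start"
req.enable_thinking = req.model_status == "think_start"
assert req.enable_thinking is True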

fastdeploy/entrypoints/openai/serving_chat.py

Lines changed: 2 additions & 7 deletions

@@ -120,7 +120,6 @@ async def create_chat_completion(self, request: ChatCompletionRequest):
             text_after_process = current_req_dict.get("text_after_process")
             if isinstance(prompt_token_ids, np.ndarray):
                 prompt_token_ids = prompt_token_ids.tolist()
-            model_status = current_req_dict.get("model_status")
         except ParameterError as e:
             api_server_logger.error(f"request[{request_id}] generator error: {str(e)}, {e.message}")
             self.engine_client.semaphore.release()
@@ -136,12 +135,12 @@ async def create_chat_completion(self, request: ChatCompletionRequest):

         if request.stream:
             return self.chat_completion_stream_generator(
-                request, request_id, request.model, prompt_token_ids, text_after_process, model_status
+                request, request_id, request.model, prompt_token_ids, text_after_process
             )
         else:
             try:
                 return await self.chat_completion_full_generator(
-                    request, request_id, request.model, prompt_token_ids, text_after_process, model_status
+                    request, request_id, request.model, prompt_token_ids, text_after_process
                 )
             except Exception as e:
                 error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}"
@@ -169,7 +168,6 @@ async def chat_completion_stream_generator(
         model_name: str,
         prompt_token_ids: list(),
         text_after_process: str,
-        model_status: str,
     ):
         """
         Streaming chat completion generator.
@@ -240,7 +238,6 @@ async def chat_completion_stream_generator(
             generator = response_processor.process_response_chat(
                 response,
                 stream=True,
-                model_status=model_status,
                 include_stop_str_in_output=include_stop_str_in_output,
             )

@@ -410,7 +407,6 @@ async def chat_completion_full_generator(
         model_name: str,
         prompt_token_ids: list(),
         text_after_process: str,
-        model_status: str,
     ):
         """
         Full chat completion generator.
@@ -460,7 +456,6 @@ async def chat_completion_full_generator(
             generator = response_processor.process_response_chat(
                 response,
                 stream=False,
-                model_status=model_status,
                 include_stop_str_in_output=include_stop_str_in_output,
            )
        async for data in generator:
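
A hedged sketch of the resulting call-shape change (simplified names, not the real signatures in serving_chat.py): the serving layer no longer threads model_status into the generators or into process_response_chat; the input processor resolves it from the request id instead (see ernie4_5_processor.py below).

# Illustration only: "before" and "after" shapes of the response-processing call.
def process_response_chat_before(response, *, stream, model_status, include_stop_str_in_output):
    """Pre-commit shape: the caller had to supply model_status explicitly."""


def process_response_chat_after(response, *, stream, include_stop_str_in_output):
    """Post-commit shape: the processor looks the status up by request id."""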

fastdeploy/input/ernie4_5_processor.py

Lines changed: 17 additions & 6 deletions

@@ -60,6 +60,7 @@ def __init__(self, model_name_or_path, reasoning_parser_obj=None, tool_parser_ob
         self.decode_status = dict()
         self.tool_parser_dict = dict()
         self.thinking_parser_dict = dict()
+        self.model_status_dict = dict()
         self._load_tokenizer()
         data_processor_logger.info(
             f"tokenizer information: bos_token is {self.tokenizer.bos_token} \
@@ -154,6 +155,12 @@ def process_request(self, request, max_model_len=None, **kwargs):
             request.set("top_p", _SAMPLING_EPS)
         if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
             request.enable_thinking = True
+        if self.reasoning_parser:
+            self.model_status_dict[request.request_id] = self.reasoning_parser.get_model_status(
+                request.prompt_token_ids
+            )
+            if self.model_status_dict[request.request_id] == "think_start":
+                request.enable_thinking = True

         data_processor_logger.info(f"Processed request: {request}")
         return request
@@ -233,8 +240,8 @@ def process_request_dict(self, request, max_model_len=None):
         if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
             request["enable_thinking"] = True
         if self.reasoning_parser:
-            request["model_status"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
-            if request["model_status"] == "think_start":
+            self.model_status_dict["request_id"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
+            if self.model_status_dict["request_id"] == "think_start":
                 request["enable_thinking"] = True
         data_processor_logger.info(f"Processed request dict: {request}")
         return request
@@ -274,6 +281,8 @@ def process_response(self, response_dict, **kwargs):
         data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
         if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
             return None
+        if req_id in self.model_status_dict:
+            del self.model_status_dict[req_id]
         return response_dict

     def process_response_dict(self, response_dict, stream, **kwargs):
@@ -302,7 +311,6 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             Dict: response contain text fields
         """
         enable_thinking = kwargs.get("enable_thinking")
-        model_status = kwargs.get("model_status")
         token_ids = response_dict["outputs"]["token_ids"]
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
@@ -317,7 +325,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
         ):
             reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                full_text, response_dict, model_status
+                full_text, response_dict, self.model_status_dict.get(req_id)
             )
             response_dict["outputs"]["text"] = text
             response_dict["outputs"]["reasoning_content"] = reasoning_content
@@ -330,6 +338,8 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             response_dict["outputs"]["raw_prediction"] = full_text
             data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}")
             del self.decode_status[req_id]
+            if req_id in self.model_status_dict:
+                del self.model_status_dict[req_id]
         return response_dict

     def process_response_dict_streaming(self, response_dict, **kwargs):
@@ -343,7 +353,6 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             Dict: response contain text fields
         """
         enable_thinking = kwargs.get("enable_thinking")
-        model_status = kwargs.get("model_status")
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
         token_ids = response_dict["outputs"]["token_ids"]
@@ -363,7 +372,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
                 previous_token_ids,
                 previous_token_ids + token_ids,
                 token_ids,
-                model_status,
+                self.model_status_dict.get(req_id),
             )
             response_dict["outputs"]["delta_message"] = reasoning_delta_message
             if self.tool_parser_obj:
@@ -387,6 +396,8 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             del self.decode_status[req_id]
         if req_id in self.tool_parser_dict:
             del self.tool_parser_dict[req_id]
+        if req_id in self.model_status_dict:
+            del self.model_status_dict[req_id]
         return response_dict

     def messages2ids(self, request_or_messages):
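
A minimal sketch, with hypothetical names (not the real processor class in ernie4_5_processor.py), of the bookkeeping pattern this file now follows: the prompt-derived model status is cached per request id when the request is processed, consulted while decoding, and removed once the request finishes.

# Hypothetical sketch of the per-request status lifecycle: store on
# process_request, read during response processing, delete on completion.
class ProcessorSketch:
    def __init__(self, reasoning_parser):
        self.reasoning_parser = reasoning_parser
        self.model_status_dict = {}

    def process_request(self, request_id, prompt_token_ids):
        # Cache the prompt-derived status, e.g. "think_start" or "think_end".
        status = self.reasoning_parser.get_model_status(prompt_token_ids)
        self.model_status_dict[request_id] = status
        return status == "think_start"  # whether thinking should be enabled

    def process_response(self, request_id, finished):
        # Look the status up by request id rather than receiving it as a kwarg.
        status = self.model_status_dict.get(request_id)
        if finished:
            self.model_status_dict.pop(request_id, None)
        return status


class AlwaysThinking:
    def get_model_status(self, prompt_token_ids):
        return "think_start"


proc = ProcessorSketch(AlwaysThinking())
assert proc.process_request("req-1", [1, 2, 3]) is True
assert proc.process_response("req-1", finished=True) == "think_start"
assert "req-1" not in proc.model_status_dict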

fastdeploy/input/text_processor.py

Lines changed: 4 additions & 0 deletions

@@ -265,6 +265,10 @@ def process_request(self, request, max_model_len=None, **kwargs):
             request.set("temperature", 1)
         if request.get("top_p") < _SAMPLING_EPS:
             request.set("top_p", _SAMPLING_EPS)
+        if self.reasoning_parser:
+            request.model_status = self.reasoning_parser.get_model_status(request.prompt_token_ids)
+            if request.model_status == "think_start":
+                request.enable_thinking = True

         data_processor_logger.info(f"Processed request: {request}")
         return request

fastdeploy/reasoning/ernie_vl_reasoning_parsers.py

Lines changed: 37 additions & 25 deletions

@@ -35,38 +35,47 @@ class ErnieVLReasoningParser(ReasoningParser):

     def __init__(self, tokenizer):
         super().__init__(tokenizer)
-        self.think_start_token = "</think>"
-        self.think_end_token = "</think>"
+        token_definitions = {
+            "think_start_token": "<think>",
+            "think_end_token": "</think>",
+        }

         if not self.model_tokenizer:
-            raise ValueError(
-                "The model tokenizer must be passed to the ReasoningParser " "constructor during construction."
+            raise ValueError("The model tokenizer must be passed to the ReasoningParser constructor.")
+
+        missing_tokens = []
+        for name, token_value in token_definitions.items():
+            setattr(self, name, token_value)
+            token_id = self.vocab.get(token_value)
+            setattr(self, f"{name}_id", token_id)
+            if token_id is None:
+                missing_tokens.append(f"{name.replace('_', ' ')} token")
+
+        if missing_tokens:
+            raise RuntimeError(
+                f"Could not find the following token ids in tokenizer vocabulary: {', '.join(missing_tokens)}"
             )
-
-        self.think_end_token_id = self.vocab.get(self.think_end_token)
-        if self.think_end_token_id is None:
-            raise RuntimeError("Ernie VL reasoning parser could not locate think end " "tokens in the tokenizer!")
-        self.think_start_token_id = self.vocab.get(self.think_start_token)
+        self.token_status_mapping = {
+            self.think_start_token_id: "think_start",
+            self.think_end_token_id: "think_end",
+        }

     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids

     def find_last_special_token(self, prompt_token_ids: list[int]) -> int:
         for i in range(len(prompt_token_ids) - 1, -1, -1):
-            if prompt_token_ids[i] in [self.think_end_token_id, self.think_start_token_id]:
+            if prompt_token_ids[i] in self.token_status_mapping:
                 return prompt_token_ids[i]
         return -1

     def get_model_status(self, prompt_token_ids: list[int]):
         special_token_id = self.find_last_special_token(prompt_token_ids)
+
         if special_token_id == -1:
-            return "responding"
-        if special_token_id == self.think_end_token_id:
-            return "responding"
-        if self.think_start_token_id == special_token_id:
-            return "thinking"
+            return "think_start"

-        return "responding"
+        return self.token_status_mapping[special_token_id]

     def extract_reasoning_content_streaming(
         self,
@@ -89,15 +98,18 @@ def extract_reasoning_content_streaming(
         # Skip single special tokens
         if len(delta_token_ids) == 1 and delta_token_ids[0] == self.think_end_token_id:
             return None
-        if self.think_end_token_id in delta_token_ids:
-            end_index = delta_text.find(self.end_token)
-            reasoning_content = delta_text[:end_index]
-            content = delta_text[end_index + len(self.end_token) :]
-            return DeltaMessage(reasoning_content=reasoning_content, content=content)
-        elif self.think_end_token_id in previous_token_ids:
-            return DeltaMessage(content=delta_text)
+        if model_status == "think_start":
+            if self.think_end_token_id in delta_token_ids:
+                end_index = delta_text.find(self.end_token)
+                reasoning_content = delta_text[:end_index]
+                content = delta_text[end_index + len(self.end_token) :]
+                return DeltaMessage(reasoning_content=reasoning_content, content=content)
+            elif self.think_end_token_id in previous_token_ids:
+                return DeltaMessage(content=delta_text)
+            else:
+                return DeltaMessage(reasoning_content=delta_text)
         else:
-            return DeltaMessage(reasoning_content=delta_text)
+            return DeltaMessage(content=delta_text)

     def extract_reasoning_content(
         self,
@@ -117,7 +129,7 @@ def extract_reasoning_content(
         """

         # Check if the model output contains the </think> tokens.
-        if model_status == "thinking":
+        if model_status == "think_start":
             if self.think_end_token not in model_output:
                 return model_output, ""
             reasoning_content, _, content = model_output.partition(self.think_end_token)
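
To illustrate the status detection introduced above, a standalone sketch (made-up token ids, not the FastDeploy class): scan the prompt backwards for the last think token and map it to a status, defaulting to "think_start" when no think token appears.

# Standalone illustration of the get_model_status logic; the token ids are
# invented for the example and do not correspond to any real tokenizer.
THINK_START_ID, THINK_END_ID = 1001, 1002
TOKEN_STATUS = {THINK_START_ID: "think_start", THINK_END_ID: "think_end"}


def get_model_status(prompt_token_ids):
    # Walk the prompt from the end and return the status of the last
    # think-start / think-end token; default to "think_start" otherwise.
    for token_id in reversed(prompt_token_ids):
        if token_id in TOKEN_STATUS:
            return TOKEN_STATUS[token_id]
    return "think_start"


assert get_model_status([7, THINK_START_ID, 9]) == "think_start"
assert get_model_status([7, THINK_START_ID, 9, THINK_END_ID]) == "think_end"
assert get_model_status([7, 8, 9]) == "think_start"

The streaming extractor then branches on this status: under "think_start" it keeps routing tokens to reasoning_content until the think-end token appears, otherwise everything is emitted as ordinary content.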

fastdeploy/reasoning/qwen3_reasoning_parsers.py

Lines changed: 5 additions & 1 deletion

@@ -51,6 +51,9 @@ def __init__(self, tokenizer):
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids

+    def get_model_status(self, prompt_token_ids: list[int]):
+        return "think_start"
+
     def extract_reasoning_content_streaming(
         self,
         previous_text: str,
@@ -59,6 +62,7 @@ def extract_reasoning_content_streaming(
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
+        model_status: str,
     ) -> Union[DeltaMessage, None]:
         """
         Extract reasoning content from a delta message.
@@ -103,7 +107,7 @@ def extract_reasoning_content_streaming(
         return DeltaMessage(reasoning_content=delta_text)

     def extract_reasoning_content(
-        self, model_output: str, request: ChatCompletionRequest
+        self, model_output: str, request: ChatCompletionRequest, model_status: str
     ) -> tuple[Optional[str], Optional[str]]:
         """
         Extract reasoning content from the model output.
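
For comparison, a sketch of what the Qwen3 parser's new hook amounts to (hypothetical class name, not the real parser): it reports a constant "think_start" status, so the existing Qwen3 reasoning extraction is unchanged while the new model_status parameter can still be passed uniformly by callers.

# Hypothetical sketch: the Qwen3 parser's get_model_status ignores the prompt
# and always reports "think_start", keeping its behaviour independent of the
# new status plumbing.
class Qwen3StatusSketch:
    def get_model_status(self, prompt_token_ids):
        return "think_start"


assert Qwen3StatusSketch().get_model_status([1, 2, 3]) == "think_start"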
