
Commit 234ef92

add model status in vl
1 parent 9082f62 commit 234ef92

File tree

3 files changed (+46 additions, -9 deletions)


fastdeploy/input/ernie4_5_processor.py

Lines changed: 12 additions & 3 deletions
```diff
@@ -232,7 +232,8 @@ def process_request_dict(self, request, max_model_len=None):
             request["top_p"] = _SAMPLING_EPS
         if self.reasoning_parser and self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser":
             request["enable_thinking"] = True
-
+        if self.reasoning_parser:
+            request["model_status"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
         data_processor_logger.info(f"Processed request dict: {request}")
         return request
 
@@ -246,6 +247,7 @@ def process_response(self, response_dict, **kwargs):
         Returns:
             Dict: response contain text fields
         """
+        model_status = kwargs.get("model_status")
         req_id = response_dict.request_id
         token_ids = response_dict.outputs.token_ids
 
@@ -254,7 +256,9 @@ def process_response(self, response_dict, **kwargs):
             token_ids = token_ids[:-1]
         full_text = self.tokenizer.decode(token_ids)
         if self.reasoning_parser:
-            reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
+            reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
+                full_text, response_dict, model_status
+            )
             response_dict.outputs.text = text
             response_dict.outputs.reasoning_content = reasoning_content
         else:
@@ -296,6 +300,7 @@ def process_response_dict_normal(self, response_dict, **kwargs):
             Dict: response contain text fields
         """
         enable_thinking = kwargs.get("enable_thinking")
+        model_status = kwargs.get("model_status")
         token_ids = response_dict["outputs"]["token_ids"]
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
@@ -308,7 +313,9 @@ def process_response_dict_normal(self, response_dict, **kwargs):
         if self.reasoning_parser and (
             enable_thinking or self.reasoning_parser.__class__.__name__ == "ErnieX1ReasoningParser"
         ):
-            reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
+            reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
+                full_text, response_dict, model_status
+            )
             response_dict["outputs"]["text"] = text
             response_dict["outputs"]["reasoning_content"] = reasoning_content
         else:
@@ -335,6 +342,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
             Dict: response contain text fields
         """
         enable_thinking = kwargs.get("enable_thinking")
+        model_status = kwargs.get("model_status")
         is_end = response_dict["finished"]
         req_id = response_dict["request_id"]
         token_ids = response_dict["outputs"]["token_ids"]
@@ -354,6 +362,7 @@ def process_response_dict_streaming(self, response_dict, **kwargs):
                 previous_token_ids,
                 previous_token_ids + token_ids,
                 token_ids,
+                model_status,
             )
             response_dict["outputs"]["delta_message"] = reasoning_delta_message
         if self.tool_parser_obj:
```
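The change threads one new field through the processor: the request side computes `model_status` once from the prompt token ids, and every response path reads it back out of `kwargs` and forwards it to the reasoning parser. A minimal, self-contained sketch of that flow; `StubParser` and token id 100 are stand-ins for illustration, not FastDeploy code:

```python
class StubParser:
    def get_model_status(self, prompt_token_ids):
        # 100 stands in for the <think> token id.
        return "thinking" if prompt_token_ids and prompt_token_ids[-1] == 100 else "responding"

    def extract_reasoning_content(self, full_text, response_dict, model_status):
        if model_status == "thinking":
            reasoning, _, text = full_text.partition("</think>")
            return reasoning, text
        return "", full_text

parser = StubParser()

# Request side (mirrors process_request_dict): stamp the status onto the request.
request = {"prompt_token_ids": [5, 7, 100]}
request["model_status"] = parser.get_model_status(request["prompt_token_ids"])

# Response side (mirrors process_response_dict_*): read it back from kwargs.
kwargs = {"model_status": request["model_status"]}
reasoning, text = parser.extract_reasoning_content(
    "2+2 is 4</think>The answer is 4.", {}, kwargs.get("model_status")
)
print(reasoning, "|", text)  # 2+2 is 4 | The answer is 4.
```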

fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -255,6 +255,9 @@ def process_request_dict(self, request, max_model_len=None):
             request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
         data_processor_logger.info(f"Processed request {request}")
 
+        if self.reasoning_parser is not None:
+            request["model_status"] = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
+
         return request
 
     def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
```

fastdeploy/reasoning/ernie_vl_reasoning_parsers.py

Lines changed: 31 additions & 6 deletions
```diff
@@ -35,6 +35,7 @@ class ErnieVLReasoningParser(ReasoningParser):
 
     def __init__(self, tokenizer):
         super().__init__(tokenizer)
+        self.think_start_token = "<think>"
         self.think_end_token = "</think>"
 
         if not self.model_tokenizer:
@@ -45,10 +46,28 @@ def __init__(self, tokenizer):
         self.think_end_token_id = self.vocab.get(self.think_end_token)
         if self.think_end_token_id is None:
             raise RuntimeError("Ernie VL reasoning parser could not locate think end tokens in the tokenizer!")
+        self.think_start_token_id = self.vocab.get(self.think_start_token)
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
         return self.think_end_token_id in input_ids
 
+    def find_last_special_token(self, prompt_token_ids: list[int]) -> int:
+        for i in range(len(prompt_token_ids) - 1, -1, -1):
+            if prompt_token_ids[i] in [self.think_end_token_id, self.think_start_token_id]:
+                return prompt_token_ids[i]
+        return -1
+
+    def get_model_status(self, prompt_token_ids: list[int]):
+        special_token_id = self.find_last_special_token(prompt_token_ids)
+        if special_token_id == -1:
+            return "responding"
+        if special_token_id == self.think_end_token_id:
+            return "responding"
+        if self.think_start_token_id == special_token_id:
+            return "thinking"
+
+        return "responding"
+
     def extract_reasoning_content_streaming(
         self,
         previous_text: str,
```
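`get_model_status` reduces to: whichever think marker appears last in the prompt decides whether the model is mid-thought. A standalone sketch of the same logic, with made-up token ids in place of the real tokenizer vocabulary:

```python
THINK_START_ID, THINK_END_ID = 100, 101  # made-up ids; the real ones come from self.vocab

def get_model_status(prompt_token_ids: list[int]) -> str:
    # Scan backwards for the most recent think marker (find_last_special_token).
    for tid in reversed(prompt_token_ids):
        if tid == THINK_START_ID:
            return "thinking"    # an opened, unclosed <think> block
        if tid == THINK_END_ID:
            return "responding"  # reasoning already closed by </think>
    return "responding"          # no markers at all

assert get_model_status([1, 2, THINK_START_ID, 3]) == "thinking"
assert get_model_status([1, THINK_START_ID, 2, THINK_END_ID]) == "responding"
assert get_model_status([1, 2, 3]) == "responding"
```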
```diff
@@ -57,6 +76,7 @@ def extract_reasoning_content_streaming(
         previous_token_ids: Sequence[int],
         current_token_ids: Sequence[int],
         delta_token_ids: Sequence[int],
+        model_status: str,
     ) -> Union[DeltaMessage, None]:
         """
         Extract reasoning content from a delta message.
@@ -80,7 +100,10 @@
             return DeltaMessage(reasoning_content=delta_text)
 
     def extract_reasoning_content(
-        self, model_output: str, request: ChatCompletionRequest
+        self,
+        model_output: str,
+        request: ChatCompletionRequest,
+        model_status: str,
     ) -> tuple[Optional[str], Optional[str]]:
         """
         Extract reasoning content from the model output.
```
```diff
@@ -94,9 +117,11 @@
         """
 
         # Check if the model output contains the </think> tokens.
-        if self.think_end_token not in model_output:
+        if model_status == "thinking":
+            if self.think_end_token not in model_output:
+                return model_output, ""
+            reasoning_content, _, content = model_output.partition(self.think_end_token)
+            final_content = content or ""
+            return reasoning_content, final_content
+        else:
             return "", model_output
-        reasoning_content, _, content = model_output.partition(self.think_end_token)
-
-        final_content = content or ""
-        return reasoning_content, final_content
```
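With `model_status` in hand, the non-streaming split becomes: in the "thinking" state, everything before `</think>` is reasoning and everything after is the answer (or, if the tag never appears, the whole output is reasoning); in the "responding" state, the whole output is answer text. A toy illustration using plain string logic, no FastDeploy imports:

```python
def split_reasoning(model_output: str, model_status: str) -> tuple[str, str]:
    if model_status == "thinking":
        if "</think>" not in model_output:
            return model_output, ""  # still mid-thought: no answer yet
        reasoning, _, answer = model_output.partition("</think>")
        return reasoning, answer or ""
    return "", model_output          # already responding: all answer text

print(split_reasoning("2+2 is 4</think>The answer is 4.", "thinking"))
# -> ('2+2 is 4', 'The answer is 4.')
print(split_reasoning("The answer is 4.", "responding"))
# -> ('', 'The answer is 4.')
```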
