
Commit a7cf50e

🐛 When generating prompts with deep thinking models, remove the deep thinking content. #1229
2 parents 2e2b205 + 48266ba commit a7cf50e

File tree

6 files changed: +227 additions, −199 deletions

backend/consts/const.py

Lines changed: 4 additions & 0 deletions
@@ -239,3 +239,7 @@
     "PROCESS_FAILED": "PROCESS_FAILED",
     "FORWARD_FAILED": "FORWARD_FAILED",
 }
+
+# Deep Thinking Constants
+THINK_START_PATTERN = "<think>"
+THINK_END_PATTERN = "</think>"

backend/services/conversation_management_service.py

Lines changed: 3 additions & 4 deletions
@@ -5,7 +5,6 @@
 from typing import Any, Dict, List, Optional
 
 from jinja2 import StrictUndefined, Template
-from nexent.core.utils.observer import ProcessType
 from smolagents import OpenAIServerModel
 
 from consts.const import LANGUAGE, MODEL_CONFIG_MAPPING, MESSAGE_ROLE
@@ -28,9 +27,10 @@
     rename_conversation,
     update_message_opinion
 )
+from nexent.core.utils.observer import ProcessType
 from utils.config_utils import get_model_name_from_config, tenant_config_manager
 from utils.prompt_template_utils import get_generate_title_prompt_template
-from utils.str_utils import add_no_think_token, remove_think_tags
+from utils.str_utils import remove_think_blocks
 
 logger = logging.getLogger("conversation_management_service")
 
@@ -274,12 +274,11 @@ def call_llm_for_title(content: str, tenant_id: str, language: str = LANGUAGE["Z
                  "content": prompt_template["SYSTEM_PROMPT"]},
                 {"role": MESSAGE_ROLE["USER"],
                  "content": user_prompt}]
-    add_no_think_token(messages)
 
     # Call the model
     response = llm(messages, max_tokens=10)
 
-    return remove_think_tags(response.content.strip())
+    return remove_think_blocks(response.content.strip())
 
 
 def update_conversation_title(conversation_id: int, title: str, user_id: str = None) -> bool:
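
The title path shows why the helper swap matters: the old remove_think_tags deleted only the tag markers, so a deep thinking model's reasoning leaked straight into the generated title, while remove_think_blocks drops the whole block. A minimal sketch of the difference, using a made-up raw response (the raw string below is hypothetical):

import re

raw = "<think>User wants a short title.</think>Trip Planning"  # hypothetical model output

# Old behavior: only the markers were stripped, leaking the reasoning (the bug in #1229)
old = raw.replace("<think>", "").replace("</think>", "")
print(old)          # -> "User wants a short title.Trip Planning"

# New behavior: the entire block, inner content included, is removed
new = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL | re.IGNORECASE)
print(new.strip())  # -> "Trip Planning"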

backend/services/prompt_service.py

Lines changed: 34 additions & 8 deletions
@@ -6,19 +6,47 @@
 from jinja2 import StrictUndefined, Template
 from smolagents import OpenAIServerModel
 
-from consts.const import LANGUAGE, MODEL_CONFIG_MAPPING, MESSAGE_ROLE
+from consts.const import LANGUAGE, MODEL_CONFIG_MAPPING, MESSAGE_ROLE, THINK_END_PATTERN, THINK_START_PATTERN
 from consts.model import AgentInfoRequest
 from database.agent_db import update_agent, query_sub_agents_id_list, search_agent_info_by_agent_id
 from database.tool_db import query_tools_by_ids
 from services.agent_service import get_enable_tool_id_by_agent_id
 from utils.config_utils import tenant_config_manager, get_model_name_from_config
 from utils.prompt_template_utils import get_prompt_generate_prompt_template
-from utils.str_utils import remove_think_tags, add_no_think_token
 
 # Configure logging
 logger = logging.getLogger("prompt_service")
 
 
+def _process_thinking_tokens(new_token: str, is_thinking: bool, token_join: list, callback=None) -> bool:
+    """
+    Process tokens to filter out thinking content between <think> and </think> tags
+
+    Args:
+        new_token: Current token from LLM stream
+        is_thinking: Current thinking state
+        token_join: List to accumulate non-thinking tokens
+        callback: Callback function for streaming output
+
+    Returns:
+        bool: updated_is_thinking
+    """
+    # Handle thinking mode
+    if is_thinking:
+        return not (THINK_END_PATTERN in new_token)
+
+    # Handle start of thinking
+    if THINK_START_PATTERN in new_token:
+        return True
+
+    # Normal token processing
+    token_join.append(new_token)
+    if callback:
+        callback("".join(token_join))
+
+    return False
+
+
 def call_llm_for_system_prompt(user_prompt: str, system_prompt: str, callback=None, tenant_id: str = None) -> str:
     """
     Call LLM to generate system prompt
@@ -45,7 +73,6 @@ def call_llm_for_system_prompt(user_prompt: str, system_prompt: str, callback=No
     )
     messages = [{"role": MESSAGE_ROLE["SYSTEM"], "content": system_prompt},
                 {"role": MESSAGE_ROLE["USER"], "content": user_prompt}]
-    add_no_think_token(messages)
     try:
         completion_kwargs = llm._prepare_completion_kwargs(
             messages=messages,
@@ -56,14 +83,13 @@ def call_llm_for_system_prompt(user_prompt: str, system_prompt: str, callback=No
         current_request = llm.client.chat.completions.create(
             stream=True, **completion_kwargs)
         token_join = []
+        is_thinking = False
         for chunk in current_request:
             new_token = chunk.choices[0].delta.content
             if new_token is not None:
-                new_token = remove_think_tags(new_token)
-                token_join.append(new_token)
-                current_text = "".join(token_join)
-                if callback is not None:
-                    callback(current_text)
+                is_thinking = _process_thinking_tokens(
+                    new_token, is_thinking, token_join, callback
+                )
         return "".join(token_join)
     except Exception as e:
         logger.error(f"Failed to generate prompt from LLM: {str(e)}")
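
To see the streaming filter end to end, here is a small standalone driver for _process_thinking_tokens, a sketch that restates the function above and assumes each tag arrives as its own whole token (the same assumption the new tests below exercise):

THINK_START_PATTERN = "<think>"
THINK_END_PATTERN = "</think>"

def _process_thinking_tokens(new_token, is_thinking, token_join, callback=None):
    # While thinking, keep dropping tokens until the closing tag shows up
    if is_thinking:
        return not (THINK_END_PATTERN in new_token)
    # A token carrying the opening tag flips us into thinking mode (and is dropped)
    if THINK_START_PATTERN in new_token:
        return True
    # Normal token: accumulate it and stream the full text so far
    token_join.append(new_token)
    if callback:
        callback("".join(token_join))
    return False

token_join, is_thinking = [], False
for tok in ["Start ", "<think>", "hidden reasoning", "</think>", " End"]:
    is_thinking = _process_thinking_tokens(tok, is_thinking, token_join, print)
print("".join(token_join))  # -> "Start  End"

One design note: the filter looks for the markers inside a single token, so a tag split across two streamed chunks (e.g. "<thi" + "nk>") would pass through; remove_think_blocks remains as the whole-string cleanup for the non-streaming paths.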

backend/utils/str_utils.py

Lines changed: 6 additions & 19 deletions
@@ -1,21 +1,8 @@
-from typing import List
+import re
 
 
-def remove_think_tags(text: str) -> str:
-    """
-    Remove thinking tags from text
-
-    Args:
-        text: Input text that may contain thinking tags
-
-    Returns:
-        str: Text with thinking tags removed
-    """
-    return text.replace("<think>", "").replace("</think>", "")
-
-
-def add_no_think_token(messages: List[dict]):
-    if not messages:
-        return
-    if messages[-1]["role"] == "user" and "content" in messages[-1]:
-        messages[-1]["content"] += " /no_think"
+def remove_think_blocks(text: str) -> str:
+    """Remove <think>...</think> blocks including inner content."""
+    if not text:
+        return text
+    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL | re.IGNORECASE)
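
A quick check of what the regex buys over the old marker-only replace: re.DOTALL lets a block span newlines, re.IGNORECASE catches mixed-case tags, and the non-greedy .*? removes each block separately. A few illustrative calls, as a sketch that restates the new helper:

import re

def remove_think_blocks(text: str) -> str:
    """Remove <think>...</think> blocks including inner content."""
    if not text:
        return text
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL | re.IGNORECASE)

print(remove_think_blocks("<think>step 1\nstep 2</think>answer"))   # DOTALL: block spans newlines -> "answer"
print(remove_think_blocks("<THINK>hidden</THINK>visible"))          # IGNORECASE: mixed-case tags -> "visible"
print(remove_think_blocks("<think>a</think>keep<think>b</think>"))  # non-greedy: each block removed -> "keep"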

test/backend/services/test_prompt_service.py

Lines changed: 144 additions & 5 deletions
@@ -25,7 +25,8 @@
     get_enabled_tool_description_for_generate_prompt,
     get_enabled_sub_agent_description_for_generate_prompt,
     generate_system_prompt,
-    join_info_for_generate_system_prompt
+    join_info_for_generate_system_prompt,
+    _process_thinking_tokens
 )
 
 
@@ -38,17 +39,14 @@ def setUp(self):
     @patch('backend.services.prompt_service.OpenAIServerModel')
     @patch('backend.services.prompt_service.tenant_config_manager')
     @patch('backend.services.prompt_service.get_model_name_from_config')
-    @patch('backend.services.prompt_service.remove_think_tags')
-    def test_call_llm_for_system_prompt(self, mock_remove_think_tags,
-                                        mock_get_model_name, mock_tenant_config, mock_openai):
+    def test_call_llm_for_system_prompt(self, mock_get_model_name, mock_tenant_config, mock_openai):
         # Setup
         mock_model_config = {
             "base_url": "http://example.com",
             "api_key": "fake-key"
         }
         mock_tenant_config.get_model_config.return_value = mock_model_config
         mock_get_model_name.return_value = "gpt-4"
-        mock_remove_think_tags.side_effect = lambda x: x  # Return input unchanged
 
         mock_llm_instance = mock_openai.return_value
 
@@ -487,6 +485,147 @@ def test_call_llm_for_system_prompt_exception(self, mock_get_model_name, mock_te
 
         self.assertIn("LLM error", str(context.exception))
 
+    def test_process_thinking_tokens_normal_token(self):
+        """Test process_thinking_tokens with normal token when not thinking"""
+        token_join = []
+        callback_calls = []
+
+        def mock_callback(text):
+            callback_calls.append(text)
+
+        is_thinking = _process_thinking_tokens(
+            "Hello", False, token_join, mock_callback)
+
+        self.assertFalse(is_thinking)
+        self.assertEqual(token_join, ["Hello"])
+        self.assertEqual(callback_calls, ["Hello"])
+
+    def test_process_thinking_tokens_start_thinking(self):
+        """Test process_thinking_tokens when encountering <think> tag"""
+        token_join = []
+        callback_calls = []
+
+        def mock_callback(text):
+            callback_calls.append(text)
+
+        is_thinking = _process_thinking_tokens(
+            "<think>", False, token_join, mock_callback)
+
+        self.assertTrue(is_thinking)
+        self.assertEqual(token_join, [])
+        self.assertEqual(callback_calls, [])
+
+    def test_process_thinking_tokens_content_while_thinking(self):
+        """Test process_thinking_tokens with content while in thinking mode"""
+        token_join = ["Hello"]
+        callback_calls = []
+
+        def mock_callback(text):
+            callback_calls.append(text)
+
+        is_thinking = _process_thinking_tokens(
+            "thinking content", True, token_join, mock_callback)
+
+        self.assertTrue(is_thinking)
+        self.assertEqual(token_join, ["Hello"])  # Should not change
+        self.assertEqual(callback_calls, [])
+
+    def test_process_thinking_tokens_end_thinking(self):
+        """Test process_thinking_tokens when encountering </think> tag"""
+        token_join = ["Hello"]
+        callback_calls = []
+
+        def mock_callback(text):
+            callback_calls.append(text)
+
+        is_thinking = _process_thinking_tokens(
+            "</think>", True, token_join, mock_callback)
+
+        self.assertFalse(is_thinking)
+        self.assertEqual(token_join, ["Hello"])  # Should not change
+        self.assertEqual(callback_calls, [])
+
+    def test_process_thinking_tokens_content_after_thinking(self):
+        """Test process_thinking_tokens with content after thinking ends"""
+        token_join = ["Hello"]
+        callback_calls = []
+
+        def mock_callback(text):
+            callback_calls.append(text)
+
+        is_thinking = _process_thinking_tokens(
+            "World", False, token_join, mock_callback)
+
+        self.assertFalse(is_thinking)
+        self.assertEqual(token_join, ["Hello", "World"])
+        self.assertEqual(callback_calls, ["HelloWorld"])
+
+    def test_process_thinking_tokens_complete_flow(self):
+        """Test process_thinking_tokens with complete thinking flow"""
+        token_join = []
+        callback_calls = []
+
+        def mock_callback(text):
+            callback_calls.append(text)
+
+        # Start with normal content
+        is_thinking = _process_thinking_tokens(
+            "Start ", False, token_join, mock_callback)
+        self.assertFalse(is_thinking)
+
+        # Enter thinking mode
+        is_thinking = _process_thinking_tokens(
+            "<think>", False, token_join, mock_callback)
+        self.assertTrue(is_thinking)
+
+        # Thinking content (ignored)
+        is_thinking = _process_thinking_tokens(
+            "thinking", True, token_join, mock_callback)
+        self.assertTrue(is_thinking)
+
+        # More thinking content (ignored)
+        is_thinking = _process_thinking_tokens(
+            " more", True, token_join, mock_callback)
+        self.assertTrue(is_thinking)
+
+        # End thinking
+        is_thinking = _process_thinking_tokens(
+            "</think>", True, token_join, mock_callback)
+        self.assertFalse(is_thinking)
+
+        # Continue with normal content
+        is_thinking = _process_thinking_tokens(
+            " End", False, token_join, mock_callback)
+        self.assertFalse(is_thinking)
+
+        # Verify final state
+        self.assertEqual(token_join, ["Start ", " End"])
+        self.assertEqual(callback_calls, ["Start ", "Start  End"])
+
+    def test_process_thinking_tokens_no_callback(self):
+        """Test process_thinking_tokens without callback function"""
+        token_join = []
+
+        is_thinking = _process_thinking_tokens("Hello", False, token_join, None)
+
+        self.assertFalse(is_thinking)
+        self.assertEqual(token_join, ["Hello"])
+
+    def test_process_thinking_tokens_empty_token(self):
+        """Test process_thinking_tokens with empty token"""
+        token_join = []
+        callback_calls = []
+
+        def mock_callback(text):
+            callback_calls.append(text)
+
+        is_thinking = _process_thinking_tokens(
+            "", False, token_join, mock_callback)
+
+        self.assertFalse(is_thinking)
+        self.assertEqual(token_join, [""])
+        self.assertEqual(callback_calls, [""])
+
 
 if __name__ == '__main__':
     unittest.main()
