|
19 | 19 | from optillm.plugins.memory_plugin import run as memory_run |
20 | 20 |
|
21 | 21 |
|
def clean_reasoning_tags(text: str) -> str:
    """
    Remove reasoning tags from model responses for clean final output.

    Removes common reasoning tag pairs (case-insensitive), including their
    contents:
    - <think></think>
    - <thinking></thinking>
    - <reasoning></reasoning>
    - <thought></thought>
    - <reflect></reflect>
    - <reflection></reflection>

    Args:
        text: Raw model response text

    Returns:
        Cleaned text with reasoning tags removed
    """
    if not text:
        return text

    # Reasoning tag patterns to remove. Non-greedy bodies so that multiple
    # separate tag pairs in one response are each removed individually.
    reasoning_patterns = [
        r'<think>.*?</think>',
        r'<thinking>.*?</thinking>',
        r'<reasoning>.*?</reasoning>',
        r'<thought>.*?</thought>',
        r'<reflect>.*?</reflect>',
        r'<reflection>.*?</reflection>',
    ]

    cleaned_text = text
    for pattern in reasoning_patterns:
        # DOTALL: reasoning bodies usually span newlines.
        # IGNORECASE: models emit inconsistent tag casing (<THINK>, <Think>, ...).
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.DOTALL | re.IGNORECASE)

    # Collapse the whitespace holes left by removed tags: 3+ consecutive
    # newlines become a single blank line.
    cleaned_text = re.sub(r'\n\s*\n\s*\n+', '\n\n', cleaned_text)
    # Collapse runs of spaces only when they follow a non-whitespace character,
    # i.e. mid-line. Leading spaces are deliberately left intact so markdown
    # formatting (indented code blocks, nested list items) survives cleaning.
    cleaned_text = re.sub(r'(?<=\S)  +', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()

    return cleaned_text
| 62 | + |
| 63 | + |
22 | 64 | class DeepResearcher: |
23 | 65 | """ |
24 | 66 | Implementation of Test-Time Diffusion Deep Researcher (TTD-DR) algorithm |
@@ -77,6 +119,8 @@ def decompose_query(self, system_prompt: str, initial_query: str) -> List[str]: |
77 | 119 | ) |
78 | 120 |
|
79 | 121 | content = response.choices[0].message.content.strip() |
| 122 | + # Clean reasoning tags from query decomposition response |
| 123 | + content = clean_reasoning_tags(content) |
80 | 124 | self.total_tokens += response.usage.completion_tokens |
81 | 125 |
|
82 | 126 | # Extract numbered queries |
@@ -217,6 +261,8 @@ def synthesize_with_memory(self, system_prompt: str, query: str, content: str, s |
217 | 261 |
|
218 | 262 | try: |
219 | 263 | synthesis, tokens = memory_run(system_prompt, memory_input, self.client, self.model) |
| 264 | + # Clean reasoning tags from synthesis response |
| 265 | + synthesis = clean_reasoning_tags(synthesis) |
220 | 266 | return synthesis, tokens |
221 | 267 | except Exception as e: |
222 | 268 | return f"Memory synthesis failed: {str(e)}", 0 |
@@ -254,6 +300,8 @@ def evaluate_completeness(self, system_prompt: str, query: str, current_synthesi |
254 | 300 | ) |
255 | 301 |
|
256 | 302 | content = response.choices[0].message.content.strip() |
| 303 | + # Clean reasoning tags from completeness evaluation response |
| 304 | + content = clean_reasoning_tags(content) |
257 | 305 | self.total_tokens += response.usage.completion_tokens |
258 | 306 |
|
259 | 307 | # Parse response |
@@ -352,6 +400,8 @@ def generate_structured_report(self, system_prompt: str, original_query: str, sy |
352 | 400 | ) |
353 | 401 |
|
354 | 402 | report_content = response.choices[0].message.content.strip() |
| 403 | + # Clean reasoning tags from final report response |
| 404 | + report_content = clean_reasoning_tags(report_content) |
355 | 405 | self.total_tokens += response.usage.completion_tokens |
356 | 406 |
|
357 | 407 | # Add references section with proper formatting |
|
0 commit comments