
Commit c95f643

Improve timeout handling and robustness in plugins
Extended timeout and retry logic for Gradio chat and deep research plugins to support long-running operations. Enhanced DeepResearcher prompts for more explicit gap analysis and research needs. Improved browser session recovery in web search plugin to handle invalidated sessions and prevent crashes. Updated default iteration and source limits for deep research to balance speed and coverage.
1 parent 85c205a commit c95f643

4 files changed: +227 -53 lines changed

optillm.py

Lines changed: 43 additions & 6 deletions
@@ -773,7 +773,11 @@ def parse_args():
         if extra and extra[0]: # Check if there are choices for this argument
             parser.add_argument(arg, type=type_, default=default, help=help_text, choices=extra[0])
         else:
-            parser.add_argument(arg, type=type_, default=default, help=help_text)
+            if type_ == bool:
+                # For boolean flags, use store_true action
+                parser.add_argument(arg, action='store_true', default=default, help=help_text)
+            else:
+                parser.add_argument(arg, type=type_, default=default, help=help_text)
 
     # Special handling for best_of_n to support both formats
     best_of_n_default = int(os.environ.get("OPTILLM_BEST_OF_N", 3))
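
For context, the type=bool pitfall this hunk works around is easy to reproduce. The sketch below is standalone and not part of the commit:

import argparse

# argparse calls bool() on the raw string, and any non-empty string
# (including "False") is truthy, so type=bool silently misparses flags.
p = argparse.ArgumentParser()
p.add_argument("--broken", type=bool, default=False)
p.add_argument("--fixed", action="store_true", default=False)

print(p.parse_args(["--broken", "False"]).broken)  # True (surprising)
print(p.parse_args(["--fixed"]).fixed)             # True
print(p.parse_args([]).fixed)                      # False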
@@ -855,12 +859,45 @@ def main():
             base_url = f"http://localhost:{port}/v1"
             logger.info(f"Launching Gradio interface connected to {base_url}")
 
-            # Launch Gradio interface
-            demo = gr.load_chat(
-                base_url,
-                model=server_config['model'],
-                token=None
+            # Create custom chat function with extended timeout
+            def chat_with_optillm(message, history):
+                import httpx
+                from openai import OpenAI
+
+                # Create client with extended timeout and no retries
+                custom_client = OpenAI(
+                    api_key="optillm",
+                    base_url=base_url,
+                    timeout=httpx.Timeout(1800.0, connect=5.0), # 30 min timeout
+                    max_retries=0 # No retries - prevents duplicate requests
+                )
+
+                # Convert history to messages format
+                messages = []
+                for h in history:
+                    if h[0]: # User message
+                        messages.append({"role": "user", "content": h[0]})
+                    if h[1]: # Assistant message
+                        messages.append({"role": "assistant", "content": h[1]})
+                messages.append({"role": "user", "content": message})
+
+                # Make request
+                try:
+                    response = custom_client.chat.completions.create(
+                        model=server_config['model'],
+                        messages=messages
+                    )
+                    return response.choices[0].message.content
+                except Exception as e:
+                    return f"Error: {str(e)}"
+
+            # Create Gradio interface with queue for long operations
+            demo = gr.ChatInterface(
+                chat_with_optillm,
+                title="OptILLM Chat Interface",
+                description=f"Connected to OptILLM proxy at {base_url}"
             )
+            demo.queue() # Enable queue to handle long operations properly
             demo.launch(server_name="0.0.0.0", share=False)
         except ImportError:
             logger.error("Gradio is required for GUI. Install it with: pip install gradio")

optillm/plugins/deep_research/research_engine.py

Lines changed: 41 additions & 13 deletions
@@ -225,7 +225,7 @@ class DeepResearcher:
     Based on: https://arxiv.org/abs/2507.16075v1
     """
 
-    def __init__(self, client, model: str, max_iterations: int = 8, max_sources: int = 15):
+    def __init__(self, client, model: str, max_iterations: int = 5, max_sources: int = 30):
         self.client = client
         self.model = model
         self.max_iterations = max_iterations
@@ -606,10 +606,17 @@ def generate_preliminary_draft(self, system_prompt: str, initial_query: str) ->
 5. Research Questions for Investigation
 6. Conclusion (preliminary thoughts)
 
-Mark sections that need external research with [NEEDS RESEARCH] tags.
-Use placeholder citations like [SOURCE NEEDED] where external evidence is required.
+IMPORTANT: You MUST mark multiple areas that need external research with [NEEDS RESEARCH] tags.
+Every claim that would benefit from external evidence should have [SOURCE NEEDED].
+This is a preliminary draft - it should have many gaps for iterative improvement.
 
-This is an initial draft - it should be substantive but acknowledge limitations.
+Example of proper marking:
+- "Recent studies show [SOURCE NEEDED] that quantum computing..."
+- "The economic impact [NEEDS RESEARCH: current market data] is significant..."
+- "Historical context [NEEDS RESEARCH: specific timeline and events] shows..."
+
+Include AT LEAST 5-10 [NEEDS RESEARCH] or [SOURCE NEEDED] tags throughout the draft.
+Be explicit about what you don't know and what needs external validation.
 """
 
         try:
@@ -639,23 +646,27 @@ def analyze_draft_gaps(self, current_draft: str, original_query: str) -> List[Di
         """
         gap_analysis_prompt = f"""
 Analyze the following research draft to identify specific gaps and areas that need external research.
-Pay special attention to any placeholder tags like [NEEDS RESEARCH], [SOURCE NEEDED], etc.
+Be thorough and aggressive in finding areas for improvement - even good drafts can be enhanced.
 
 Original Query: {original_query}
 
 Current Draft:
 {current_draft}
 
-PRIORITY ANALYSIS:
-1. First, identify any [NEEDS RESEARCH], [SOURCE NEEDED], [CITATION NEEDED] or similar placeholder tags
-2. Then identify other substantial gaps in content, evidence, or depth
+CRITICAL ANALYSIS REQUIRED:
+1. MANDATORY: Find ALL [NEEDS RESEARCH], [SOURCE NEEDED], [CITATION NEEDED] tags
+2. Identify claims lacking evidence (even if not explicitly marked)
+3. Find areas that could benefit from recent data or statistics
+4. Spot generalizations that need specific examples
+5. Locate outdated information or areas needing current updates
+6. Identify missing perspectives or counterarguments
 
 For each gap you identify, provide:
 1. SECTION: Which section has the gap
-2. GAP_TYPE: [PLACEHOLDER_TAG, MISSING_INFO, OUTDATED_INFO, NEEDS_EVIDENCE, LACKS_DEPTH, NEEDS_EXAMPLES]
+2. GAP_TYPE: [PLACEHOLDER_TAG, MISSING_INFO, OUTDATED_INFO, NEEDS_EVIDENCE, LACKS_DEPTH, NEEDS_EXAMPLES, MISSING_PERSPECTIVE]
 3. SPECIFIC_NEED: Exactly what information is needed
-4. SEARCH_QUERY: A specific search query to address this gap
-5. PRIORITY: [HIGH, MEDIUM, LOW] - HIGH for placeholder tags that need immediate resolution
+4. SEARCH_QUERY: A specific, targeted search query to address this gap
+5. PRIORITY: [HIGH, MEDIUM, LOW] - HIGH for placeholder tags and critical missing info
 
 Format each gap as:
 GAP_ID: [number]
@@ -665,7 +676,9 @@ def analyze_draft_gaps(self, current_draft: str, original_query: str) -> List[Di
 SEARCH_QUERY: [search query to find this info]
 PRIORITY: [priority level]
 
-Identify 3-6 most critical gaps, prioritizing any placeholder tags that need resolution.
+IMPORTANT: Identify AT LEAST 3-8 gaps. Be critical and thorough.
+Even well-written sections can benefit from additional evidence, examples, or perspectives.
+Push for depth, accuracy, and comprehensiveness in the research.
 """
 
         try:
@@ -701,6 +714,8 @@ def analyze_draft_gaps(self, current_draft: str, original_query: str) -> List[Di
                     current_gap['specific_need'] = line.split(':', 1)[1].strip()
                 elif line.startswith('SEARCH_QUERY:'):
                     current_gap['search_query'] = line.split(':', 1)[1].strip()
+                elif line.startswith('PRIORITY:'):
+                    current_gap['priority'] = line.split(':', 1)[1].strip()
 
             if current_gap:
                 gaps.append(current_gap)
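
As a quick illustration of what this parser produces (a standalone sketch; the real loop in analyze_draft_gaps also handles the GAP_ID, SECTION, and GAP_TYPE lines that fall outside this hunk):

sample = """GAP_ID: 1
SECTION: Background
GAP_TYPE: NEEDS_EVIDENCE
SPECIFIC_NEED: Current market-size figures
SEARCH_QUERY: quantum computing market size 2024
PRIORITY: HIGH"""

gap = {}
for line in sample.splitlines():
    line = line.strip()
    if line.startswith('SPECIFIC_NEED:'):
        gap['specific_need'] = line.split(':', 1)[1].strip()
    elif line.startswith('SEARCH_QUERY:'):
        gap['search_query'] = line.split(':', 1)[1].strip()
    elif line.startswith('PRIORITY:'):  # the field this commit starts capturing
        gap['priority'] = line.split(':', 1)[1].strip()

print(gap)
# {'specific_need': 'Current market-size figures',
#  'search_query': 'quantum computing market size 2024',
#  'priority': 'HIGH'}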
@@ -960,6 +975,7 @@ def generate_structured_report(self, system_prompt: str, original_query: str, sy
 8. Group related citations together when appropriate [1,2,3]
 9. Ensure the Executive Summary captures the essence of the entire report
 10. Make recommendations specific and actionable
+11. DO NOT create a References section - it will be added automatically
 """
 
         try:
@@ -978,6 +994,12 @@ def generate_structured_report(self, system_prompt: str, original_query: str, sy
             report_content = clean_reasoning_tags(report_content)
             self.total_tokens += response.usage.completion_tokens
 
+            # Remove any References section the LLM might have created
+            # This prevents duplicate reference sections
+            report_content = re.sub(r'##\s*References.*?(?=##|\Z)', '', report_content, flags=re.DOTALL)
+            report_content = re.sub(r'(?m)^References\s*\n\s*(?:\[\d+\]\s*\n)+', '', report_content)
+            report_content = re.sub(r'\n\s*\n\s*\n+', '\n\n', report_content) # Clean up extra newlines
+
             # Add references section with proper formatting
             references = "\n\n## References\n\n"
             for num, source in sorted(self.citations.items()):
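
A quick standalone check of the first pattern: it strips a model-written "## References" section, matching non-greedily up to the next "##" heading or the end of the text, so only the programmatically built reference list survives:

import re

report = (
    "## Findings\n\nAdoption grew sharply [1].\n\n"
    "## References\n\n[1] A source the model invented\n"
)
cleaned = re.sub(r'##\s*References.*?(?=##|\Z)', '', report, flags=re.DOTALL)
print(cleaned)  # only the "## Findings" section remains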
@@ -1132,6 +1154,7 @@ def finalize_research_report(self, system_prompt: str, original_query: str, fina
 - If sections are incomplete, either complete them with available information or remove them entirely
 - Ensure all statements are backed by available evidence or are clearly marked as preliminary findings
 - The report must be publication-ready with no incomplete elements
+- DO NOT create a References section - it will be added automatically
 
 Return the final polished research report.
 """
@@ -1168,6 +1191,11 @@ def finalize_research_report(self, system_prompt: str, original_query: str, fina
 
         self.total_tokens += response.usage.completion_tokens
 
+        # Remove any References section the LLM might have created
+        polished_report = re.sub(r'##\s*References.*?(?=##|\Z)', '', polished_report, flags=re.DOTALL)
+        polished_report = re.sub(r'(?m)^References\s*\n\s*(?:\[\d+\]\s*\n)+', '', polished_report)
+        polished_report = re.sub(r'\n\s*\n\s*\n+', '\n\n', polished_report) # Clean up extra newlines
+
         # Add references section
         references = "\n\n## References\n\n"
         for num, source in sorted(self.citations.items()):
@@ -1179,7 +1207,7 @@ def finalize_research_report(self, system_prompt: str, original_query: str, fina
         # Add TTD-DR metadata
         metadata = "\n---\n\n**TTD-DR Research Metadata:**\n"
         metadata += f"- Algorithm: Test-Time Diffusion Deep Researcher\n"
-        metadata += f"- Denoising iterations: {len(self.draft_history)}\n"
+        metadata += f"- Denoising iterations: {len(self.draft_history) - 1}\n"
         metadata += f"- Total gaps addressed: {sum(len(gaps) for gaps in self.gap_analysis_history)}\n"
         metadata += f"- Component fitness: {self.component_fitness}\n"
         metadata += f"- Total sources consulted: {len(self.citations)}\n"
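
The "- 1" here presumably accounts for the preliminary draft stored in draft_history before any denoising pass runs, so the metadata counts only actual refinement iterations rather than every stored draft.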

optillm/plugins/deep_research_plugin.py

Lines changed: 99 additions & 25 deletions
@@ -16,6 +16,93 @@
 SLUG = "deep_research"
 
 
+class DeepResearchClientWrapper:
+    """Wrapper that adds extended timeout support for different client types"""
+
+    def __init__(self, client, timeout=1800.0, max_retries=0):
+        self.client = client
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.client_type = self._detect_client_type()
+        self.chat = self.Chat(self)
+
+    def _detect_client_type(self):
+        """Detect the type of client based on class name"""
+        class_name = self.client.__class__.__name__
+        module_name = self.client.__class__.__module__
+
+        # Check for OpenAI-compatible clients (OpenAI, Cerebras, AzureOpenAI)
+        if 'OpenAI' in class_name or 'Cerebras' in class_name:
+            return 'openai_compatible'
+        # Check for LiteLLM wrapper
+        elif 'LiteLLMWrapper' in class_name:
+            return 'litellm'
+        # All other clients (OptILLM inference, etc.)
+        else:
+            return 'other'
+
+    class Chat:
+        def __init__(self, parent):
+            self.parent = parent
+            self.completions = self.Completions(parent)
+
+        class Completions:
+            def __init__(self, parent):
+                self.parent = parent
+
+            def create(self, **kwargs):
+                """Create completion with appropriate timeout handling"""
+                if self.parent.client_type == 'openai_compatible':
+                    # For OpenAI-compatible clients, recreate with timeout
+                    try:
+                        # Import here to avoid circular dependencies
+                        if 'Cerebras' in self.parent.client.__class__.__name__:
+                            from cerebras.cloud.sdk import Cerebras
+                            custom_client = Cerebras(
+                                api_key=self.parent.client.api_key,
+                                base_url=getattr(self.parent.client, 'base_url', None),
+                                timeout=self.parent.timeout,
+                                max_retries=self.parent.max_retries
+                            )
+                        else:
+                            # OpenAI or AzureOpenAI
+                            if 'Azure' in self.parent.client.__class__.__name__:
+                                from openai import AzureOpenAI
+                                # AzureOpenAI has different parameters
+                                custom_client = AzureOpenAI(
+                                    api_key=self.parent.client.api_key,
+                                    api_version=getattr(self.parent.client, 'api_version', None),
+                                    azure_endpoint=getattr(self.parent.client, 'azure_endpoint', None),
+                                    azure_ad_token_provider=getattr(self.parent.client, 'azure_ad_token_provider', None),
+                                    timeout=self.parent.timeout,
+                                    max_retries=self.parent.max_retries
+                                )
+                            else:
+                                from openai import OpenAI
+                                custom_client = OpenAI(
+                                    api_key=self.parent.client.api_key,
+                                    base_url=getattr(self.parent.client, 'base_url', None),
+                                    timeout=self.parent.timeout,
+                                    max_retries=self.parent.max_retries
+                                )
+                        return custom_client.chat.completions.create(**kwargs)
+                    except Exception as e:
+                        # If recreation fails, use original client
+                        print(f"⚠️ Warning: Could not create custom client with timeout: {str(e)}")
+                        return self.parent.client.chat.completions.create(**kwargs)
+
+                elif self.parent.client_type == 'litellm':
+                    # For LiteLLM, add timeout to the call
+                    kwargs['timeout'] = self.parent.timeout
+                    return self.parent.client.chat.completions.create(**kwargs)
+
+                else:
+                    # For other clients (like OptILLM), just pass through
+                    # They handle timeouts internally
+                    print(f"ℹ️ Using original client (type: {self.parent.client.__class__.__name__}) without timeout modification")
+                    return self.parent.client.chat.completions.create(**kwargs)
+
+
 def run(system_prompt: str, initial_query: str, client, model: str, request_config: Optional[Dict] = None) -> Tuple[str, int]:
     """
     Deep Research plugin implementing TTD-DR (Test-Time Diffusion Deep Researcher)
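
A sketch of how the wrapper is meant to be used: it mirrors just enough of the OpenAI client surface (client.chat.completions.create) that DeepResearcher can call it without knowing a wrapper is involved. Client construction and the model name below are placeholders:

from openai import OpenAI

base = OpenAI(api_key="sk-placeholder")  # any supported client type
wrapped = DeepResearchClientWrapper(base, timeout=1800.0, max_retries=0)

response = wrapped.chat.completions.create(
    model="gpt-4o-mini",  # hypothetical model name
    messages=[{"role": "user", "content": "ping"}],
)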
@@ -32,16 +119,16 @@ def run(system_prompt: str, initial_query: str, client, model: str, request_conf
         client: OpenAI client for LLM calls
         model: Model name to use for synthesis
         request_config: Optional configuration dict with keys:
-            - max_iterations: Maximum research iterations (default: 8)
-            - max_sources: Maximum web sources per search (default: 15)
+            - max_iterations: Maximum research iterations (default: 5)
+            - max_sources: Maximum web sources per search (default: 30)
 
     Returns:
         Tuple of (comprehensive_research_response, total_completion_tokens)
     """
     # Parse configuration
     config = request_config or {}
-    max_iterations = config.get("max_iterations", 8) # Increased to 8 for thorough research
-    max_sources = config.get("max_sources", 15) # Increased to 15 for comprehensive coverage
+    max_iterations = config.get("max_iterations", 5) # Default to 5 iterations for faster results
+    max_sources = config.get("max_sources", 30) # Balanced for comprehensive coverage
 
     # Validate inputs
     if not initial_query.strip():
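
Callers who prefer the old, more exhaustive behaviour can still restore it per request. A hypothetical invocation (prompts and model name are placeholders):

result, tokens = run(
    system_prompt="You are a meticulous research assistant.",
    initial_query="Survey the state of post-quantum cryptography adoption",
    client=client,
    model="gpt-4o-mini",
    request_config={"max_iterations": 8, "max_sources": 15},  # the pre-commit defaults
)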
@@ -50,9 +137,13 @@ def run(system_prompt: str, initial_query: str, client, model: str, request_conf
     if not client:
         return "Error: No LLM client provided for research synthesis", 0
 
-    # Initialize researcher
+    # Create a wrapped client with extended timeout for deep research
+    # Deep research can take a long time, so we need 30 minutes timeout and no retries
+    wrapped_client = DeepResearchClientWrapper(client, timeout=1800.0, max_retries=0)
+
+    # Initialize researcher with wrapped client
     researcher = DeepResearcher(
-        client=client,
+        client=wrapped_client,
         model=model,
         max_iterations=max_iterations,
         max_sources=max_sources
@@ -64,22 +155,5 @@ def run(system_prompt: str, initial_query: str, client, model: str, request_conf
         return result, total_tokens
 
     except Exception as e:
-        error_response = f"Deep research failed: {str(e)}\n\nFalling back to basic response..."
-
-        # Fallback: provide basic response using just the model
-        try:
-            fallback_response = client.chat.completions.create(
-                model=model,
-                messages=[
-                    {"role": "system", "content": system_prompt},
-                    {"role": "user", "content": initial_query}
-                ]
-            )
-
-            result = fallback_response.choices[0].message.content.strip()
-            tokens = fallback_response.usage.completion_tokens
-
-            return f"{error_response}\n\n{result}", tokens
-
-        except Exception as fallback_error:
-            return f"Deep research and fallback both failed: {str(e)} | {str(fallback_error)}", 0
+        error_message = f"Deep research failed: {str(e)}"
+        return error_message, 0
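
With the internal fallback removed, a failure now surfaces as a plain error string with zero tokens. A caller that still wants the old fall-back-to-base-model behaviour could reproduce it at the call site, roughly like this (a sketch based on the deleted code):

result, tokens = run(system_prompt, initial_query, client, model)
if tokens == 0 and result.startswith("Deep research failed:"):
    # Reproduce the removed fallback outside the plugin instead.
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": initial_query},
        ],
    )
    result = response.choices[0].message.content.strip()
    tokens = response.usage.completion_tokens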
