Add browser session manager for web searches

codelion · codelion · commit e08d4ab0c01f · 2025-07-24T22:25:08.000+08:00
Introduces BrowserSessionManager to enable reuse of a single browser session across multiple web searches, improving efficiency and reliability. DeepResearcher now uses a shared browser session for all search operations within a research run, and web_search_plugin's run function supports session reuse via the new manager.
diff --git a/optillm/plugins/deep_research/research_engine.py b/optillm/plugins/deep_research/research_engine.py
@@ -14,7 +14,7 @@
 from typing import Tuple, List, Dict, Optional, Any
 from datetime import datetime
 from collections import defaultdict
-from optillm.plugins.web_search_plugin import run as web_search_run
+from optillm.plugins.web_search_plugin import run as web_search_run, BrowserSessionManager
 from optillm.plugins.readurls_plugin import run as readurls_run
 from optillm.plugins.memory_plugin import run as memory_run
 
@@ -250,6 +250,7 @@ def __init__(self, client, model: str, max_iterations: int = 8, max_sources: int
             "integration_ability": 1.0
         }
         self.gap_analysis_history = []  # Track identified gaps over time
+        self.session_manager = None  # Browser session manager for web searches
     
     def cleanup_placeholder_tags(self, text: str) -> str:
         """
@@ -386,6 +387,10 @@ def perform_web_search(self, queries: List[str]) -> str:
         """
         all_results = []
         
+        # Defensive default: if the attribute is missing, fall back to None (no shared browser session)
+        if not hasattr(self, 'session_manager'):
+            self.session_manager = None
+        
         # Perform individual searches for each query to avoid truncation issues
         for i, query in enumerate(queries):
             try:
@@ -398,7 +403,8 @@ def perform_web_search(self, queries: List[str]) -> str:
                 enhanced_query, _ = web_search_run("", search_query, None, None, {
                     "num_results": results_per_query,
                     "delay_seconds": None,  # Use default random delay (4-32 seconds)
-                    "headless": False  # Allow CAPTCHA solving if needed
+                    "headless": False,  # Allow CAPTCHA solving if needed
+                    "session_manager": self.session_manager  # Use shared browser session
                 })
                 
                 if enhanced_query and "Web Search Results" in enhanced_query:
@@ -710,6 +716,10 @@ def perform_gap_targeted_search(self, gaps: List[Dict[str, str]]) -> str:
         """
         all_results = []
         
+        # Defensive default: if the attribute is missing, fall back to None (no shared browser session)
+        if not hasattr(self, 'session_manager'):
+            self.session_manager = None
+        
         # Sort gaps by priority - HIGH priority first (placeholder tags)
         sorted_gaps = sorted(gaps, key=lambda g: (
             0 if g.get('priority', '').upper() == 'HIGH' else
@@ -729,7 +739,8 @@ def perform_gap_targeted_search(self, gaps: List[Dict[str, str]]) -> str:
                 enhanced_query, _ = web_search_run("", search_query, None, None, {
                     "num_results": max(1, self.max_sources // len(gaps)),
                     "delay_seconds": None,  # Use default random delay (4-32 seconds)
-                    "headless": False
+                    "headless": False,
+                    "session_manager": self.session_manager  # Use shared browser session
                 })
                 
                 if enhanced_query and "Web Search Results" in enhanced_query:
@@ -995,77 +1006,87 @@ def research(self, system_prompt: str, initial_query: str) -> Tuple[str, int]:
         4. Quality-guided termination
         """
         
-        # PHASE 1: INITIALIZATION - Generate preliminary draft (updatable skeleton)
-        print("TTD-DR: Generating preliminary draft...")
-        self.current_draft = self.generate_preliminary_draft(system_prompt, initial_query)
-        self.draft_history.append(self.current_draft)
-        
-        # PHASE 2: ITERATIVE DENOISING LOOP
-        for iteration in range(self.max_iterations):
-            self.research_state["iteration"] = iteration + 1
-            print(f"TTD-DR: Denoising iteration {iteration + 1}/{self.max_iterations}")
-            
-            # STEP 1: Analyze current draft for gaps (draft-guided search)
-            print("  - Analyzing draft gaps...")
-            gaps = self.analyze_draft_gaps(self.current_draft, initial_query)
-            self.gap_analysis_history.append(gaps)
-            
-            if not gaps:
-                print("  - No significant gaps found, research complete")
-                break
-            
-            # STEP 2: Perform gap-targeted retrieval
-            print(f"  - Performing targeted search for {len(gaps)} gaps...")
-            retrieval_content = self.perform_gap_targeted_search(gaps)
-            
-            # STEP 3: Extract and fetch URLs from search results
-            print("  - Extracting and fetching content...")
-            content_with_urls, sources = self.extract_and_fetch_urls(retrieval_content)
-            
-            # Register sources for citations
-            for source in sources:
-                if 'url' in source:
-                    self.citation_counter += 1
-                    self.citations[self.citation_counter] = source
-            
-            # STEP 4: DENOISING - Integrate retrieved info with current draft
-            print("  - Performing denoising step...")
-            previous_draft = self.current_draft
-            self.current_draft = self.denoise_draft_with_retrieval(
-                self.current_draft, content_with_urls, initial_query
-            )
-            self.draft_history.append(self.current_draft)
-            
-            # STEP 5: Evaluate quality improvement
-            print("  - Evaluating draft quality...")
-            quality_scores = self.evaluate_draft_quality(
-                self.current_draft, previous_draft, initial_query
-            )
-            
-            # STEP 6: Component self-evolution based on feedback
-            self.update_component_fitness(quality_scores)
-            
-            # STEP 7: Check termination conditions
-            completeness = quality_scores.get('completeness', 0.0)
-            improvement = quality_scores.get('improvement', 0.0)
-            
-            print(f"  - Quality scores: Completeness={completeness:.2f}, Improvement={improvement:.2f}")
-            
-            # Terminate if high quality achieved or minimal improvement
-            # More lenient termination to ensure complete research
-            if completeness > 0.9 or (improvement < 0.03 and completeness > 0.7):
-                print("  - Quality threshold reached, research complete")
-                break
+        # Use a single browser session for all searches in this research
+        with BrowserSessionManager(headless=False, timeout=30) as session_manager:
+            print("🔬 Starting deep research with single browser session")
+            self.session_manager = session_manager  # Store for use in search methods
             
-            # Store current state for tracking
-            self.research_state["content"].append(content_with_urls)
-            self.research_state["sources"].extend([s['url'] for s in sources if 'url' in s])
-        
-        # PHASE 3: FINALIZATION - Polish the final draft
-        print("TTD-DR: Finalizing research report...")
-        final_report = self.finalize_research_report(system_prompt, initial_query, self.current_draft)
-        
-        return final_report, self.total_tokens
+            try:
+                # PHASE 1: INITIALIZATION - Generate preliminary draft (updatable skeleton)
+                print("TTD-DR: Generating preliminary draft...")
+                self.current_draft = self.generate_preliminary_draft(system_prompt, initial_query)
+                self.draft_history.append(self.current_draft)
+        
+                # PHASE 2: ITERATIVE DENOISING LOOP
+                for iteration in range(self.max_iterations):
+                    self.research_state["iteration"] = iteration + 1
+                    print(f"TTD-DR: Denoising iteration {iteration + 1}/{self.max_iterations}")
+                    
+                    # STEP 1: Analyze current draft for gaps (draft-guided search)
+                    print("  - Analyzing draft gaps...")
+                    gaps = self.analyze_draft_gaps(self.current_draft, initial_query)
+                    self.gap_analysis_history.append(gaps)
+                    
+                    if not gaps:
+                        print("  - No significant gaps found, research complete")
+                        break
+                    
+                    # STEP 2: Perform gap-targeted retrieval
+                    print(f"  - Performing targeted search for {len(gaps)} gaps...")
+                    retrieval_content = self.perform_gap_targeted_search(gaps)
+                    
+                    # STEP 3: Extract and fetch URLs from search results
+                    print("  - Extracting and fetching content...")
+                    content_with_urls, sources = self.extract_and_fetch_urls(retrieval_content)
+                    
+                    # Register sources for citations
+                    for source in sources:
+                        if 'url' in source:
+                            self.citation_counter += 1
+                            self.citations[self.citation_counter] = source
+                    
+                    # STEP 4: DENOISING - Integrate retrieved info with current draft
+                    print("  - Performing denoising step...")
+                    previous_draft = self.current_draft
+                    self.current_draft = self.denoise_draft_with_retrieval(
+                        self.current_draft, content_with_urls, initial_query
+                    )
+                    self.draft_history.append(self.current_draft)
+                    
+                    # STEP 5: Evaluate quality improvement
+                    print("  - Evaluating draft quality...")
+                    quality_scores = self.evaluate_draft_quality(
+                        self.current_draft, previous_draft, initial_query
+                    )
+                    
+                    # STEP 6: Component self-evolution based on feedback
+                    self.update_component_fitness(quality_scores)
+                    
+                    # STEP 7: Check termination conditions
+                    completeness = quality_scores.get('completeness', 0.0)
+                    improvement = quality_scores.get('improvement', 0.0)
+                    
+                    print(f"  - Quality scores: Completeness={completeness:.2f}, Improvement={improvement:.2f}")
+                    
+                    # Terminate if high quality achieved or minimal improvement
+                    # More lenient termination to ensure complete research
+                    if completeness > 0.9 or (improvement < 0.03 and completeness > 0.7):
+                        print("  - Quality threshold reached, research complete")
+                        break
+                    
+                    # Store current state for tracking
+                    self.research_state["content"].append(content_with_urls)
+                    self.research_state["sources"].extend([s['url'] for s in sources if 'url' in s])
+                
+                # PHASE 3: FINALIZATION - Polish the final draft
+                print("TTD-DR: Finalizing research report...")
+                final_report = self.finalize_research_report(system_prompt, initial_query, self.current_draft)
+                
+                return final_report, self.total_tokens
+                
+            finally:
+                # Clean up session manager reference
+                self.session_manager = None
     
     def finalize_research_report(self, system_prompt: str, original_query: str, final_draft: str) -> str:
         """
diff --git a/optillm/plugins/web_search_plugin.py b/optillm/plugins/web_search_plugin.py
@@ -17,6 +17,67 @@
 
 SLUG = "web_search"
 
+
+class BrowserSessionManager:
+    """
+    Holds one lazily created GoogleSearcher and reuses it for every search() call.
+    Works as a context manager: __enter__ creates the searcher, __exit__ calls close().
+    """
+    def __init__(self, headless: bool = False, timeout: int = 30):
+        self.headless = headless  # forwarded to GoogleSearcher when it is created
+        self.timeout = timeout  # forwarded to GoogleSearcher when it is created
+        self._searcher = None  # created on first use by get_or_create_searcher()
+        self._search_count = 0  # number of search() calls since the last close()
+        self._session_start_time = None  # set only by __enter__; stays None without "with"
+    
+    def __enter__(self):
+        """Create the searcher up front, record the session start time, and return self."""
+        self.get_or_create_searcher()
+        self._session_start_time = time.time()
+        return self
+    
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Close the searcher when the with-block ends; returns False so exceptions propagate."""
+        self.close()
+        return False  # Don't suppress exceptions
+    
+    def get_or_create_searcher(self) -> 'GoogleSearcher':
+        """Return the cached GoogleSearcher, building it from headless/timeout if none exists yet."""
+        if self._searcher is None:
+            print("🌐 Creating new browser session for research...")
+            self._searcher = GoogleSearcher(
+                headless=self.headless,
+                timeout=self.timeout
+            )
+        return self._searcher
+    
+    def search(self, query: str, num_results: int = 10, delay_seconds: Optional[int] = None) -> List[Dict[str, str]]:
+        """Run one query on the shared searcher (creating it if needed) and return its results."""
+        searcher = self.get_or_create_searcher()
+        self._search_count += 1  # counted before the search runs, so a failing search is counted too
+        print(f"🔍 Search #{self._search_count} in current session: {query[:50]}...")
+        return searcher.search(query, num_results, delay_seconds)
+    
+    def close(self):
+        """Close the searcher if one exists and reset all session state; close errors are printed, not raised."""
+        if self._searcher is not None:
+            try:
+                self._searcher.close()
+                if self._session_start_time:  # summary is skipped when __enter__ never set a start time
+                    duration = time.time() - self._session_start_time
+                    print(f"🏁 Browser session closed after {self._search_count} searches ({duration:.1f}s)")
+            except Exception as e:
+                print(f"⚠️ Error closing browser session: {e}")
+            finally:
+                self._searcher = None  # a later search() will create a fresh searcher
+                self._search_count = 0
+                self._session_start_time = None
+    
+    def is_active(self) -> bool:
+        """Return True when a searcher exists and its driver attribute is not None."""
+        return self._searcher is not None and self._searcher.driver is not None
+
+
 class GoogleSearcher:
     def __init__(self, headless: bool = False, timeout: int = 30):
         self.timeout = timeout
@@ -459,31 +520,44 @@ def run(system_prompt: str, initial_query: str, client=None, model: str = None,
                             Set to 0 to disable delays, or specify exact seconds
             - headless: Run browser in headless mode (default: False)
             - timeout: Browser timeout in seconds (default: 30)
+            - session_manager: BrowserSessionManager instance for session reuse
     
     Returns:
         Tuple of (enhanced_query_with_search_results, completion_tokens)
     """
     # Parse configuration
     config = request_config or {}
     num_results = config.get("num_results", 10)
-    delay_seconds = config.get("delay_seconds", None)  # None means random 32-128
+    delay_seconds = config.get("delay_seconds", None)  # None means random 4-32
     headless = config.get("headless", False)  # Default to non-headless
     timeout = config.get("timeout", 30)  # Standard timeout
+    session_manager = config.get("session_manager", None)  # For session reuse
     
     # Extract search queries from the input
     search_queries = extract_search_queries(initial_query)
     
     if not search_queries:
         return initial_query, 0
     
-    searcher = None
+    # Determine if we should manage the browser lifecycle
+    own_session = session_manager is None
+    
     try:
-        searcher = GoogleSearcher(headless=headless, timeout=timeout)
+        # Use provided session manager or create temporary one
+        if own_session:
+            # Create temporary searcher for standalone use
+            searcher = GoogleSearcher(headless=headless, timeout=timeout)
+        
         enhanced_query = initial_query
         
         for query in search_queries:
             # Perform the search
-            results = searcher.search(query, num_results=num_results, delay_seconds=delay_seconds)
+            if session_manager:
+                # Use session manager's search method
+                results = session_manager.search(query, num_results=num_results, delay_seconds=delay_seconds)
+            else:
+                # Use temporary searcher
+                results = searcher.search(query, num_results=num_results, delay_seconds=delay_seconds)
             
             # Format results
             if results:
@@ -502,5 +576,6 @@ def run(system_prompt: str, initial_query: str, client=None, model: str = None,
         return enhanced_query, 0
         
     finally:
-        if searcher:
+        # Only close if we created our own searcher
+        if own_session and 'searcher' in locals():
             searcher.close()