Fix AsyncEngine duplication

vzucher · vzucher · commit ff615dc49e09 · 2025-11-24T09:31:55.000Z
diff --git a/src/brightdata/api/scrape_service.py b/src/brightdata/api/scrape_service.py
@@ -47,7 +47,10 @@ def amazon(self):
         """
         if self._amazon is None:
             from ..scrapers.amazon import AmazonScraper
-            self._amazon = AmazonScraper(bearer_token=self._client.token)
+            self._amazon = AmazonScraper(
+                bearer_token=self._client.token,
+                engine=self._client.engine
+            )
         return self._amazon
     
     @property
@@ -73,7 +76,10 @@ def linkedin(self):
         """
         if self._linkedin is None:
             from ..scrapers.linkedin import LinkedInScraper
-            self._linkedin = LinkedInScraper(bearer_token=self._client.token)
+            self._linkedin = LinkedInScraper(
+                bearer_token=self._client.token,
+                engine=self._client.engine
+            )
         return self._linkedin
     
     @property
@@ -96,7 +102,10 @@ def chatgpt(self):
         """
         if self._chatgpt is None:
             from ..scrapers.chatgpt import ChatGPTScraper
-            self._chatgpt = ChatGPTScraper(bearer_token=self._client.token)
+            self._chatgpt = ChatGPTScraper(
+                bearer_token=self._client.token,
+                engine=self._client.engine
+            )
         return self._chatgpt
     
     @property
@@ -132,7 +141,10 @@ def facebook(self):
         """
         if self._facebook is None:
             from ..scrapers.facebook import FacebookScraper
-            self._facebook = FacebookScraper(bearer_token=self._client.token)
+            self._facebook = FacebookScraper(
+                bearer_token=self._client.token,
+                engine=self._client.engine
+            )
         return self._facebook
     
     @property
@@ -166,7 +178,10 @@ def instagram(self):
         """
         if self._instagram is None:
             from ..scrapers.instagram import InstagramScraper
-            self._instagram = InstagramScraper(bearer_token=self._client.token)
+            self._instagram = InstagramScraper(
+                bearer_token=self._client.token,
+                engine=self._client.engine
+            )
         return self._instagram
     
     @property
diff --git a/src/brightdata/api/search_service.py b/src/brightdata/api/search_service.py
@@ -207,7 +207,10 @@ def linkedin(self):
         """
         if self._linkedin_search is None:
             from ..scrapers.linkedin.search import LinkedInSearchScraper
-            self._linkedin_search = LinkedInSearchScraper(bearer_token=self._client.token)
+            self._linkedin_search = LinkedInSearchScraper(
+                bearer_token=self._client.token,
+                engine=self._client.engine
+            )
         return self._linkedin_search
     
     @property
@@ -235,7 +238,10 @@ def chatGPT(self):
         """
         if self._chatgpt_search is None:
             from ..scrapers.chatgpt.search import ChatGPTSearchService
-            self._chatgpt_search = ChatGPTSearchService(bearer_token=self._client.token)
+            self._chatgpt_search = ChatGPTSearchService(
+                bearer_token=self._client.token,
+                engine=self._client.engine
+            )
         return self._chatgpt_search
     
     @property
@@ -264,6 +270,9 @@ def instagram(self):
         """
         if self._instagram_search is None:
             from ..scrapers.instagram.search import InstagramSearchScraper
-            self._instagram_search = InstagramSearchScraper(bearer_token=self._client.token)
+            self._instagram_search = InstagramSearchScraper(
+                bearer_token=self._client.token,
+                engine=self._client.engine
+            )
         return self._instagram_search
 
diff --git a/src/brightdata/scrapers/base.py b/src/brightdata/scrapers/base.py
@@ -61,12 +61,15 @@ class BaseWebScraper(ABC):
     MIN_POLL_TIMEOUT: int = DEFAULT_MIN_POLL_TIMEOUT
     COST_PER_RECORD: float = DEFAULT_COST_PER_RECORD
     
-    def __init__(self, bearer_token: Optional[str] = None):
+    def __init__(self, bearer_token: Optional[str] = None, engine: Optional[AsyncEngine] = None):
         """
         Initialize platform scraper.
         
         Args:
             bearer_token: Bright Data API token. If None, loads from environment.
+            engine: Optional AsyncEngine instance. If provided, reuses the existing engine
+                   (recommended when using via client to share connection pool and rate limiter).
+                   If None, creates a new engine (for standalone usage).
         
         Raises:
             ValidationError: If token not provided and not in environment
@@ -78,7 +81,8 @@ def __init__(self, bearer_token: Optional[str] = None):
                 f"Provide bearer_token parameter or set BRIGHTDATA_API_TOKEN environment variable."
             )
         
-        self.engine = AsyncEngine(self.bearer_token)
+        # Reuse engine if provided (for resource efficiency), otherwise create new one
+        self.engine = engine if engine is not None else AsyncEngine(self.bearer_token)
         self.api_client = DatasetAPIClient(self.engine)
         self.workflow_executor = WorkflowExecutor(
             api_client=self.api_client,
diff --git a/tests/unit/test_engine_sharing.py b/tests/unit/test_engine_sharing.py
@@ -0,0 +1,218 @@
+"""
+Test script to verify AsyncEngine sharing across scrapers.
+
+This script verifies that the AsyncEngine duplication fix works correctly by:
+1. Counting AsyncEngine instances before/after creating client
+2. Accessing multiple scrapers and verifying only one engine exists
+3. Ensuring resource efficiency and proper engine reuse
+
+Expected output:
+- Before creating client: 0 engines
+- After creating client: 1 engine
+- After accessing all scrapers: 1 engine (SHOULD STILL BE 1)
+
+If this test passes, the fix is working correctly!
+"""
+
+import gc
+import sys
+import os
+
+# Add src to path so we can import brightdata
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+from brightdata import BrightDataClient
+from brightdata.core.engine import AsyncEngine
+
+
+def count_engines():
+    """Count the number of AsyncEngine instances in memory."""
+    gc.collect()  # Force garbage collection to get accurate count
+    engines = [obj for obj in gc.get_objects() if isinstance(obj, AsyncEngine)]
+    return len(engines)
+
+
+def test_engine_sharing():
+    """Test that only one engine is created and shared across all scrapers."""
+    
+    print("=" * 70)
+    print("AsyncEngine Sharing Test")
+    print("=" * 70)
+    print()
+    
+    # Step 1: Check baseline (should be 0)
+    initial_count = count_engines()
+    print(f"✓ Step 1: Before creating client: {initial_count} engine(s)")
+    
+    if initial_count != 0:
+        print(f"  ⚠️  Warning: Expected 0 engines, found {initial_count}")
+    print()
+    
+    # Step 2: Create client (should create 1 engine)
+    print("✓ Step 2: Creating BrightDataClient...")
+    
+    # Try to load token from environment, or use placeholder
+    token = os.getenv("BRIGHTDATA_API_TOKEN")
+    if not token:
+        print("  ⚠️  Warning: No BRIGHTDATA_API_TOKEN found, using placeholder")
+        token = "test_token_placeholder_12345"
+    
+    client = BrightDataClient(token=token)
+    
+    after_client_count = count_engines()
+    print(f"✓ Step 3: After creating client: {after_client_count} engine(s)")
+    
+    if after_client_count != 1:
+        print(f"  ❌ FAILED: Expected 1 engine, found {after_client_count}")
+        return False
+    print()
+    
+    # Step 3: Access all scrapers (should still be 1 engine)
+    print("✓ Step 4: Accessing all scrapers...")
+    
+    scrapers_accessed = []
+    
+    try:
+        # Access scrape services
+        _ = client.scrape.amazon
+        scrapers_accessed.append("amazon")
+        
+        _ = client.scrape.linkedin
+        scrapers_accessed.append("linkedin")
+        
+        _ = client.scrape.facebook
+        scrapers_accessed.append("facebook")
+        
+        _ = client.scrape.instagram
+        scrapers_accessed.append("instagram")
+        
+        _ = client.scrape.chatgpt
+        scrapers_accessed.append("chatgpt")
+        
+        # Access search services
+        _ = client.search.linkedin
+        scrapers_accessed.append("search.linkedin")
+        
+        _ = client.search.instagram
+        scrapers_accessed.append("search.instagram")
+        
+        _ = client.search.chatGPT
+        scrapers_accessed.append("search.chatGPT")
+        
+        print(f"  Accessed {len(scrapers_accessed)} scrapers: {', '.join(scrapers_accessed)}")
+        
+    except Exception as e:
+        print(f"  ⚠️  Warning: Error accessing scrapers: {e}")
+    
+    print()
+    
+    # Step 4: Count engines after accessing all scrapers
+    after_scrapers_count = count_engines()
+    print(f"✓ Step 5: After accessing all scrapers: {after_scrapers_count} engine(s)")
+    print()
+    
+    # Verify the result
+    print("=" * 70)
+    print("Test Results")
+    print("=" * 70)
+    
+    if after_scrapers_count == 1:
+        print("✅ SUCCESS! Only 1 AsyncEngine instance exists.")
+        print("   All scrapers are sharing the client's engine.")
+        print("   Resource efficiency: OPTIMAL")
+        print()
+        print("   Benefits:")
+        print("   • Single HTTP connection pool")
+        print("   • Unified rate limiting")
+        print("   • Reduced memory usage")
+        print("   • Better connection reuse")
+        return True
+    else:
+        print(f"❌ FAILED! Found {after_scrapers_count} AsyncEngine instances.")
+        print("   Expected: 1 engine (shared across all scrapers)")
+        print(f"   Actual: {after_scrapers_count} engines (resource duplication)")
+        print()
+        print("   This means:")
+        print("   • Multiple connection pools created")
+        print("   • Inefficient resource usage")
+        print("   • Engine duplication not fixed")
+        return False
+
+
+def test_standalone_scraper():
+    """Test that standalone scrapers still work (backwards compatibility)."""
+    
+    print()
+    print("=" * 70)
+    print("Standalone Scraper Test (Backwards Compatibility)")
+    print("=" * 70)
+    print()
+    
+    # Clear any existing engines
+    gc.collect()
+    initial_count = count_engines()
+    
+    print(f"✓ Initial engine count: {initial_count}")
+    
+    # Import and create a standalone scraper
+    from brightdata.scrapers.amazon import AmazonScraper
+    
+    print("✓ Creating standalone AmazonScraper (without passing engine)...")
+    
+    try:
+        token = os.getenv("BRIGHTDATA_API_TOKEN", "test_token_placeholder_12345")
+        scraper = AmazonScraper(bearer_token=token)
+        
+        standalone_count = count_engines()
+        print(f"✓ After creating standalone scraper: {standalone_count} engine(s)")
+        
+        expected_count = initial_count + 1
+        if standalone_count == expected_count:
+            print("✅ SUCCESS! Standalone scraper creates its own engine.")
+            print("   Backwards compatibility: MAINTAINED")
+            return True
+        else:
+            print(f"❌ FAILED! Expected {expected_count} engines, found {standalone_count}")
+            return False
+            
+    except Exception as e:
+        print(f"⚠️  Warning: Could not create standalone scraper: {e}")
+        print("   (This is expected if bearer token is missing)")
+        return True  # Don't fail the test if token is missing
+
+
+if __name__ == "__main__":
+    print()
+    print("╔" + "═" * 68 + "╗")
+    print("║" + " " * 15 + "AsyncEngine Duplication Fix Test" + " " * 20 + "║")
+    print("╚" + "═" * 68 + "╝")
+    print()
+    
+    # Run both tests
+    test1_passed = test_engine_sharing()
+    test2_passed = test_standalone_scraper()
+    
+    print()
+    print("=" * 70)
+    print("Final Results")
+    print("=" * 70)
+    print()
+    
+    if test1_passed and test2_passed:
+        print("✅ ALL TESTS PASSED!")
+        print()
+        print("The AsyncEngine duplication fix is working correctly:")
+        print("• Single engine shared across all client scrapers ✓")
+        print("• Standalone scrapers still create their own engine ✓")
+        print("• Backwards compatibility maintained ✓")
+        print("• Resource efficiency achieved ✓")
+        sys.exit(0)
+    else:
+        print("❌ SOME TESTS FAILED")
+        print()
+        if not test1_passed:
+            print("• Engine sharing test failed - duplication still exists")
+        if not test2_passed:
+            print("• Standalone scraper test failed - backwards compatibility broken")
+        sys.exit(1)
+