Skip to content

Commit ff615dc

Browse files
committed
Fix AsyncEngine duplication
1 parent 657edc0 commit ff615dc

File tree

4 files changed

+256
-10
lines changed

4 files changed

+256
-10
lines changed

src/brightdata/api/scrape_service.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,10 @@ def amazon(self):
4747
"""
4848
if self._amazon is None:
4949
from ..scrapers.amazon import AmazonScraper
50-
self._amazon = AmazonScraper(bearer_token=self._client.token)
50+
self._amazon = AmazonScraper(
51+
bearer_token=self._client.token,
52+
engine=self._client.engine
53+
)
5154
return self._amazon
5255

5356
@property
@@ -73,7 +76,10 @@ def linkedin(self):
7376
"""
7477
if self._linkedin is None:
7578
from ..scrapers.linkedin import LinkedInScraper
76-
self._linkedin = LinkedInScraper(bearer_token=self._client.token)
79+
self._linkedin = LinkedInScraper(
80+
bearer_token=self._client.token,
81+
engine=self._client.engine
82+
)
7783
return self._linkedin
7884

7985
@property
@@ -96,7 +102,10 @@ def chatgpt(self):
96102
"""
97103
if self._chatgpt is None:
98104
from ..scrapers.chatgpt import ChatGPTScraper
99-
self._chatgpt = ChatGPTScraper(bearer_token=self._client.token)
105+
self._chatgpt = ChatGPTScraper(
106+
bearer_token=self._client.token,
107+
engine=self._client.engine
108+
)
100109
return self._chatgpt
101110

102111
@property
@@ -132,7 +141,10 @@ def facebook(self):
132141
"""
133142
if self._facebook is None:
134143
from ..scrapers.facebook import FacebookScraper
135-
self._facebook = FacebookScraper(bearer_token=self._client.token)
144+
self._facebook = FacebookScraper(
145+
bearer_token=self._client.token,
146+
engine=self._client.engine
147+
)
136148
return self._facebook
137149

138150
@property
@@ -166,7 +178,10 @@ def instagram(self):
166178
"""
167179
if self._instagram is None:
168180
from ..scrapers.instagram import InstagramScraper
169-
self._instagram = InstagramScraper(bearer_token=self._client.token)
181+
self._instagram = InstagramScraper(
182+
bearer_token=self._client.token,
183+
engine=self._client.engine
184+
)
170185
return self._instagram
171186

172187
@property

src/brightdata/api/search_service.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,10 @@ def linkedin(self):
207207
"""
208208
if self._linkedin_search is None:
209209
from ..scrapers.linkedin.search import LinkedInSearchScraper
210-
self._linkedin_search = LinkedInSearchScraper(bearer_token=self._client.token)
210+
self._linkedin_search = LinkedInSearchScraper(
211+
bearer_token=self._client.token,
212+
engine=self._client.engine
213+
)
211214
return self._linkedin_search
212215

213216
@property
@@ -235,7 +238,10 @@ def chatGPT(self):
235238
"""
236239
if self._chatgpt_search is None:
237240
from ..scrapers.chatgpt.search import ChatGPTSearchService
238-
self._chatgpt_search = ChatGPTSearchService(bearer_token=self._client.token)
241+
self._chatgpt_search = ChatGPTSearchService(
242+
bearer_token=self._client.token,
243+
engine=self._client.engine
244+
)
239245
return self._chatgpt_search
240246

241247
@property
@@ -264,6 +270,9 @@ def instagram(self):
264270
"""
265271
if self._instagram_search is None:
266272
from ..scrapers.instagram.search import InstagramSearchScraper
267-
self._instagram_search = InstagramSearchScraper(bearer_token=self._client.token)
273+
self._instagram_search = InstagramSearchScraper(
274+
bearer_token=self._client.token,
275+
engine=self._client.engine
276+
)
268277
return self._instagram_search
269278

src/brightdata/scrapers/base.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,12 +61,15 @@ class BaseWebScraper(ABC):
6161
MIN_POLL_TIMEOUT: int = DEFAULT_MIN_POLL_TIMEOUT
6262
COST_PER_RECORD: float = DEFAULT_COST_PER_RECORD
6363

64-
def __init__(self, bearer_token: Optional[str] = None):
64+
def __init__(self, bearer_token: Optional[str] = None, engine: Optional[AsyncEngine] = None):
6565
"""
6666
Initialize platform scraper.
6767
6868
Args:
6969
bearer_token: Bright Data API token. If None, loads from environment.
70+
engine: Optional AsyncEngine instance. If provided, reuses the existing engine
71+
(recommended when using via client to share connection pool and rate limiter).
72+
If None, creates a new engine (for standalone usage).
7073
7174
Raises:
7275
ValidationError: If token not provided and not in environment
@@ -78,7 +81,8 @@ def __init__(self, bearer_token: Optional[str] = None):
7881
f"Provide bearer_token parameter or set BRIGHTDATA_API_TOKEN environment variable."
7982
)
8083

81-
self.engine = AsyncEngine(self.bearer_token)
84+
# Reuse engine if provided (for resource efficiency), otherwise create new one
85+
self.engine = engine if engine is not None else AsyncEngine(self.bearer_token)
8286
self.api_client = DatasetAPIClient(self.engine)
8387
self.workflow_executor = WorkflowExecutor(
8488
api_client=self.api_client,

tests/unit/test_engine_sharing.py

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
"""
2+
Test script to verify AsyncEngine sharing across scrapers.
3+
4+
This script verifies that the AsyncEngine duplication fix works correctly by:
5+
1. Counting AsyncEngine instances before/after creating client
6+
2. Accessing multiple scrapers and verifying only one engine exists
7+
3. Ensuring resource efficiency and proper engine reuse
8+
9+
Expected output:
10+
- Before creating client: 0 engines
11+
- After creating client: 1 engine
12+
- After accessing all scrapers: 1 engine (SHOULD STILL BE 1)
13+
14+
If this test passes, the fix is working correctly!
15+
"""
16+
17+
import gc
18+
import sys
19+
import os
20+
21+
# Add src to path so we can import brightdata
22+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
23+
24+
from brightdata import BrightDataClient
25+
from brightdata.core.engine import AsyncEngine
26+
27+
28+
def count_engines():
29+
"""Count the number of AsyncEngine instances in memory."""
30+
gc.collect() # Force garbage collection to get accurate count
31+
engines = [obj for obj in gc.get_objects() if isinstance(obj, AsyncEngine)]
32+
return len(engines)
33+
34+
35+
def test_engine_sharing():
36+
"""Test that only one engine is created and shared across all scrapers."""
37+
38+
print("=" * 70)
39+
print("AsyncEngine Sharing Test")
40+
print("=" * 70)
41+
print()
42+
43+
# Step 1: Check baseline (should be 0)
44+
initial_count = count_engines()
45+
print(f"✓ Step 1: Before creating client: {initial_count} engine(s)")
46+
47+
if initial_count != 0:
48+
print(f" ⚠️ Warning: Expected 0 engines, found {initial_count}")
49+
print()
50+
51+
# Step 2: Create client (should create 1 engine)
52+
print("✓ Step 2: Creating BrightDataClient...")
53+
54+
# Try to load token from environment, or use placeholder
55+
token = os.getenv("BRIGHTDATA_API_TOKEN")
56+
if not token:
57+
print(" ⚠️ Warning: No BRIGHTDATA_API_TOKEN found, using placeholder")
58+
token = "test_token_placeholder_12345"
59+
60+
client = BrightDataClient(token=token)
61+
62+
after_client_count = count_engines()
63+
print(f"✓ Step 3: After creating client: {after_client_count} engine(s)")
64+
65+
if after_client_count != 1:
66+
print(f" ❌ FAILED: Expected 1 engine, found {after_client_count}")
67+
return False
68+
print()
69+
70+
# Step 3: Access all scrapers (should still be 1 engine)
71+
print("✓ Step 4: Accessing all scrapers...")
72+
73+
scrapers_accessed = []
74+
75+
try:
76+
# Access scrape services
77+
_ = client.scrape.amazon
78+
scrapers_accessed.append("amazon")
79+
80+
_ = client.scrape.linkedin
81+
scrapers_accessed.append("linkedin")
82+
83+
_ = client.scrape.facebook
84+
scrapers_accessed.append("facebook")
85+
86+
_ = client.scrape.instagram
87+
scrapers_accessed.append("instagram")
88+
89+
_ = client.scrape.chatgpt
90+
scrapers_accessed.append("chatgpt")
91+
92+
# Access search services
93+
_ = client.search.linkedin
94+
scrapers_accessed.append("search.linkedin")
95+
96+
_ = client.search.instagram
97+
scrapers_accessed.append("search.instagram")
98+
99+
_ = client.search.chatGPT
100+
scrapers_accessed.append("search.chatGPT")
101+
102+
print(f" Accessed {len(scrapers_accessed)} scrapers: {', '.join(scrapers_accessed)}")
103+
104+
except Exception as e:
105+
print(f" ⚠️ Warning: Error accessing scrapers: {e}")
106+
107+
print()
108+
109+
# Step 4: Count engines after accessing all scrapers
110+
after_scrapers_count = count_engines()
111+
print(f"✓ Step 5: After accessing all scrapers: {after_scrapers_count} engine(s)")
112+
print()
113+
114+
# Verify the result
115+
print("=" * 70)
116+
print("Test Results")
117+
print("=" * 70)
118+
119+
if after_scrapers_count == 1:
120+
print("✅ SUCCESS! Only 1 AsyncEngine instance exists.")
121+
print(" All scrapers are sharing the client's engine.")
122+
print(" Resource efficiency: OPTIMAL")
123+
print()
124+
print(" Benefits:")
125+
print(" • Single HTTP connection pool")
126+
print(" • Unified rate limiting")
127+
print(" • Reduced memory usage")
128+
print(" • Better connection reuse")
129+
return True
130+
else:
131+
print(f"❌ FAILED! Found {after_scrapers_count} AsyncEngine instances.")
132+
print(" Expected: 1 engine (shared across all scrapers)")
133+
print(f" Actual: {after_scrapers_count} engines (resource duplication)")
134+
print()
135+
print(" This means:")
136+
print(" • Multiple connection pools created")
137+
print(" • Inefficient resource usage")
138+
print(" • Engine duplication not fixed")
139+
return False
140+
141+
142+
def test_standalone_scraper():
143+
"""Test that standalone scrapers still work (backwards compatibility)."""
144+
145+
print()
146+
print("=" * 70)
147+
print("Standalone Scraper Test (Backwards Compatibility)")
148+
print("=" * 70)
149+
print()
150+
151+
# Clear any existing engines
152+
gc.collect()
153+
initial_count = count_engines()
154+
155+
print(f"✓ Initial engine count: {initial_count}")
156+
157+
# Import and create a standalone scraper
158+
from brightdata.scrapers.amazon import AmazonScraper
159+
160+
print("✓ Creating standalone AmazonScraper (without passing engine)...")
161+
162+
try:
163+
token = os.getenv("BRIGHTDATA_API_TOKEN", "test_token_placeholder_12345")
164+
scraper = AmazonScraper(bearer_token=token)
165+
166+
standalone_count = count_engines()
167+
print(f"✓ After creating standalone scraper: {standalone_count} engine(s)")
168+
169+
expected_count = initial_count + 1
170+
if standalone_count == expected_count:
171+
print("✅ SUCCESS! Standalone scraper creates its own engine.")
172+
print(" Backwards compatibility: MAINTAINED")
173+
return True
174+
else:
175+
print(f"❌ FAILED! Expected {expected_count} engines, found {standalone_count}")
176+
return False
177+
178+
except Exception as e:
179+
print(f"⚠️ Warning: Could not create standalone scraper: {e}")
180+
print(" (This is expected if bearer token is missing)")
181+
return True # Don't fail the test if token is missing
182+
183+
184+
if __name__ == "__main__":
185+
print()
186+
print("╔" + "═" * 68 + "╗")
187+
print("║" + " " * 15 + "AsyncEngine Duplication Fix Test" + " " * 20 + "║")
188+
print("╚" + "═" * 68 + "╝")
189+
print()
190+
191+
# Run both tests
192+
test1_passed = test_engine_sharing()
193+
test2_passed = test_standalone_scraper()
194+
195+
print()
196+
print("=" * 70)
197+
print("Final Results")
198+
print("=" * 70)
199+
print()
200+
201+
if test1_passed and test2_passed:
202+
print("✅ ALL TESTS PASSED!")
203+
print()
204+
print("The AsyncEngine duplication fix is working correctly:")
205+
print("• Single engine shared across all client scrapers ✓")
206+
print("• Standalone scrapers still create their own engine ✓")
207+
print("• Backwards compatibility maintained ✓")
208+
print("• Resource efficiency achieved ✓")
209+
sys.exit(0)
210+
else:
211+
print("❌ SOME TESTS FAILED")
212+
print()
213+
if not test1_passed:
214+
print("• Engine sharing test failed - duplication still exists")
215+
if not test2_passed:
216+
print("• Standalone scraper test failed - backwards compatibility broken")
217+
sys.exit(1)
218+

0 commit comments

Comments
 (0)