Commit b595e59

feat: add new trigger interface
1 parent 23436a6 commit b595e59

File tree

11 files changed: +1416 −11 lines


examples/11_trigger_interface.py

Lines changed: 253 additions & 0 deletions
@@ -0,0 +1,253 @@
"""
Example: Manual Trigger/Poll/Fetch Interface

Demonstrates how to use the new trigger interface for manual control
over the scrape lifecycle: trigger -> status -> fetch.

Use cases:
- Start multiple scrapes concurrently
- Custom polling logic
- Save job IDs for later retrieval
- Optimize cost and timing

Run: python examples/11_trigger_interface.py
"""

import asyncio
import time
from brightdata import BrightDataClient


# ============================================================================
# Example 1: Basic Trigger/Poll/Fetch Pattern
# ============================================================================

async def example_basic_trigger():
    """Trigger a scrape, wait, and fetch results manually."""

    print("=" * 60)
    print("Example 1: Basic Trigger/Poll/Fetch")
    print("=" * 60)

    async with BrightDataClient() as client:
        amazon = client.scrape.amazon

        # Step 1: Trigger the scrape (returns immediately)
        print("\n🚀 Triggering Amazon product scrape...")
        job = await amazon.products_trigger_async(
            url="https://www.amazon.com/dp/B0CRMZHDG8"
        )
        print(f"✅ Job triggered: {job.snapshot_id}")

        # Step 2: Check status manually
        print("\n🔍 Checking job status...")
        status = await job.status_async()
        print(f"Status: {status}")

        # Step 3: Wait for completion (with custom timeout)
        print("\n⏳ Waiting for completion...")
        await job.wait_async(timeout=180, verbose=True)

        # Step 4: Fetch results
        print("\n📥 Fetching results...")
        data = await job.fetch_async()
        print(f"✅ Got {len(data) if isinstance(data, list) else 1} records")

        # Or use convenience method (wait + fetch + wrap in ScrapeResult)
        print("\n💡 Alternative: Use to_result_async()...")
        result = await job.to_result_async()
        print(f"Success: {result.success}")
        print(f"Cost: ${result.cost:.4f}")


# ============================================================================
# Example 2: Concurrent Scraping (Trigger Multiple, Fetch Later)
# ============================================================================

async def example_concurrent_scraping():
    """Trigger multiple scrapes concurrently, then fetch all."""

    print("\n\n" + "=" * 60)
    print("Example 2: Concurrent Scraping")
    print("=" * 60)

    async with BrightDataClient() as client:
        amazon = client.scrape.amazon

        # URLs to scrape
        urls = [
            "https://www.amazon.com/dp/B0CRMZHDG8",
            "https://www.amazon.com/dp/B09B9C8K3T",
            "https://www.amazon.com/dp/B0CX23V2ZK",
        ]

        # Step 1: Trigger all scrapes (non-blocking)
        print("\n🚀 Triggering multiple scrapes...")
        jobs = []
        for i, url in enumerate(urls, 1):
            job = await amazon.products_trigger_async(url=url)
            jobs.append(job)
            print(f" [{i}/{len(urls)}] Triggered: {job.snapshot_id[:12]}...")

        print(f"\n✅ All {len(jobs)} jobs triggered!")

        # Step 2: Wait for all to complete
        print("\n⏳ Waiting for all jobs to complete...")
        results = []
        for i, job in enumerate(jobs, 1):
            print(f" [{i}/{len(jobs)}] Waiting for job {job.snapshot_id[:12]}...")
            result = await job.to_result_async(timeout=180)
            results.append(result)

        # Step 3: Process all results
        print("\n📊 Results summary:")
        total_cost = sum(r.cost or 0 for r in results)
        successful = sum(1 for r in results if r.success)
        print(f" - Successful: {successful}/{len(results)}")
        print(f" - Total cost: ${total_cost:.4f}")
        print(f" - Avg time: {sum(r.elapsed_ms() or 0 for r in results) / len(results):.0f}ms")


# ============================================================================
# Example 3: Custom Polling Logic
# ============================================================================

async def example_custom_polling():
    """Implement custom polling logic with your own intervals."""

    print("\n\n" + "=" * 60)
    print("Example 3: Custom Polling Logic")
    print("=" * 60)

    async with BrightDataClient() as client:
        amazon = client.scrape.amazon

        # Trigger the scrape
        print("\n🚀 Triggering scrape...")
        job = await amazon.products_trigger_async(
            url="https://www.amazon.com/dp/B0CRMZHDG8"
        )
        print(f"✅ Job ID: {job.snapshot_id}")

        # Custom polling with exponential backoff
        print("\n⏳ Custom polling with exponential backoff...")
        poll_interval = 2  # Start with 2 seconds
        max_interval = 20  # Max 20 seconds
        max_attempts = 30

        for attempt in range(max_attempts):
            status = await job.status_async()
            elapsed = time.time() - job.triggered_at.timestamp()

            print(f" [{elapsed:.1f}s] Attempt {attempt + 1}: {status}")

            if status == "ready":
                print("✅ Job completed!")
                data = await job.fetch_async()
                print(f"📥 Got {len(data) if isinstance(data, list) else 1} records")
                break
            elif status == "error":
                print("❌ Job failed")
                break

            # Wait with exponential backoff
            await asyncio.sleep(poll_interval)
            poll_interval = min(poll_interval * 1.5, max_interval)
        else:
            print("⏰ Timeout reached")


# ============================================================================
# Example 4: Save Job ID for Later Retrieval
# ============================================================================

async def example_save_and_resume():
    """Trigger a job, save the ID, and retrieve it later."""

    print("\n\n" + "=" * 60)
    print("Example 4: Save Job ID & Resume Later")
    print("=" * 60)

    async with BrightDataClient() as client:
        amazon = client.scrape.amazon

        # Phase 1: Trigger and save job ID
        print("\n📝 Phase 1: Trigger and save job ID...")
        job = await amazon.products_trigger_async(
            url="https://www.amazon.com/dp/B0CRMZHDG8"
        )
        snapshot_id = job.snapshot_id
        print(f"✅ Job triggered: {snapshot_id}")
        print(f"💾 Saved snapshot_id for later: {snapshot_id}")

        # Simulate doing other work...
        print("\n💤 Simulating other work (5 seconds)...")
        await asyncio.sleep(5)

        # Phase 2: Resume with saved snapshot_id
        print("\n🔄 Phase 2: Resume with saved snapshot_id...")
        print(f"📂 Loading snapshot_id: {snapshot_id}")

        # Check status using the snapshot_id directly
        status = await amazon.products_status_async(snapshot_id)
        print(f"Status: {status}")

        # Fetch if ready
        if status == "ready":
            data = await amazon.products_fetch_async(snapshot_id)
            print(f"✅ Fetched {len(data) if isinstance(data, list) else 1} records")
        else:
            print("⏳ Job not ready yet, would need to wait longer...")


# ============================================================================
# Example 5: Sync Usage (for non-async code)
# ============================================================================

def example_sync_usage():
    """Use trigger interface in synchronous code."""

    print("\n\n" + "=" * 60)
    print("Example 5: Sync Usage")
    print("=" * 60)

    client = BrightDataClient()
    amazon = client.scrape.amazon

    # Trigger (sync)
    print("\n🚀 Triggering scrape (sync)...")
    job = amazon.products_trigger(url="https://www.amazon.com/dp/B0CRMZHDG8")
    print(f"✅ Job ID: {job.snapshot_id}")

    # Check status (sync)
    print("\n🔍 Checking status (sync)...")
    status = job.status()
    print(f"Status: {status}")

    # Wait and fetch (sync)
    print("\n⏳ Waiting for completion (sync)...")
    result = job.to_result(timeout=180)
    print(f"Success: {result.success}")
    print(f"Cost: ${result.cost:.4f}")


# ============================================================================
# Run All Examples
# ============================================================================

if __name__ == "__main__":
    print("\n🚀 Trigger Interface Examples\n")

    # Run async examples
    asyncio.run(example_basic_trigger())
    asyncio.run(example_concurrent_scraping())
    asyncio.run(example_custom_polling())
    asyncio.run(example_save_and_resume())

    # Run sync example
    example_sync_usage()

    print("\n" + "=" * 60)
    print("✅ All examples completed!")
    print("=" * 60)
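A note on Example 2: the loop above awaits each products_trigger_async call before issuing the next, so the triggers themselves go out one at a time. A variation that fires every trigger and resolves every job in parallel with asyncio.gather is sketched below; it is not part of this commit and assumes only the methods demonstrated in the example file:

import asyncio
from brightdata import BrightDataClient

async def scrape_all(urls: list[str]) -> list:
    async with BrightDataClient() as client:
        amazon = client.scrape.amazon
        # Issue all triggers at once rather than awaiting them one by one
        jobs = await asyncio.gather(
            *(amazon.products_trigger_async(url=url) for url in urls)
        )
        # Wait on every job concurrently; each resolves to a ScrapeResult
        return await asyncio.gather(
            *(job.to_result_async(timeout=180) for job in jobs)
        )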

src/brightdata/__init__.py

Lines changed: 5 additions & 0 deletions
@@ -14,6 +14,9 @@
     Result,
 )
 
+# Export job model for manual trigger/poll/fetch
+from .scrapers.job import ScrapeJob
+
 # Export payload models (dataclasses)
 from .payloads import (
     # Base
@@ -75,6 +78,8 @@
     "SearchResult",
     "CrawlResult",
     "Result",
+    # Job model for manual control
+    "ScrapeJob",
     # Payload models (dataclasses)
     "BasePayload",
     "URLPayload",

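With this export in place, ScrapeJob can be imported straight from the package root, which is useful for type-hinting code built on the trigger interface. A minimal sketch (the helper is hypothetical; it assumes ScrapeJob is the type returned by the *_trigger_async methods shown in the example above):

from brightdata import BrightDataClient, ScrapeJob

async def trigger_product(client: BrightDataClient, url: str) -> ScrapeJob:
    # Hypothetical helper: returns the job handle without waiting,
    # so the caller can poll status_async() or fetch_async() later
    return await client.scrape.amazon.products_trigger_async(url=url)
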
src/brightdata/scrapers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -2,6 +2,7 @@
 
 from .base import BaseWebScraper
 from .registry import register, get_scraper_for, get_registered_platforms, is_platform_supported
+from .job import ScrapeJob
 
 # Import scrapers to trigger registration
 try:
@@ -37,6 +38,7 @@
 
 __all__ = [
     "BaseWebScraper",
+    "ScrapeJob",
     "register",
     "get_scraper_for",
     "get_registered_platforms",

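Since brightdata/__init__.py imports the class from .scrapers.job and the subpackage re-exports it from the same module, both import paths should resolve to the same object. A quick sketch under that assumption:

from brightdata import ScrapeJob
from brightdata.scrapers import ScrapeJob as ScrapersScrapeJob

# Both names should refer to the same class object
assert ScrapeJob is ScrapersScrapeJob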