
Commit 9f36969

Major fixes to how we retrieve data from the API through the SDK's providers
1 parent 8a206c4 commit 9f36969

28 files changed: +3714 -340 lines

lastcheck.md

Lines changed: 400 additions & 0 deletions
Large diffs are not rendered by default.

src/brightdata/scrapers/amazon/scraper.py

Lines changed: 6 additions & 17 deletions

@@ -43,9 +43,9 @@ class AmazonScraper(BaseWebScraper):
     """

     # Amazon dataset IDs
-    DATASET_ID = "gd_l7q7dkf244hwxbl93"  # Amazon Products
-    DATASET_ID_REVIEWS = "gd_l1vq6tkpl34p7mq7c"  # Amazon Reviews
-    DATASET_ID_SELLERS = "gd_lwjkkolem8c4o7j3s"  # Amazon Sellers
+    DATASET_ID = "gd_l7q7dkf244hwjntr0"  # Amazon Products
+    DATASET_ID_REVIEWS = "gd_le8e811kzy4ggddlq"  # Amazon Reviews
+    DATASET_ID_SELLERS = "gd_lhotzucw1etoe5iw1k"  # Amazon Sellers

     PLATFORM_NAME = "amazon"
     MIN_POLL_TIMEOUT = DEFAULT_TIMEOUT_MEDIUM  # Amazon scrapes can take longer

@@ -150,21 +150,10 @@ async def reviews_async(
         else:
             validate_url_list(url)

-        # Build custom payload with review filters
+        # Build payload - Amazon Reviews dataset only accepts URL
+        # Note: pastDays, keyWord, numOfReviews are not supported by the API
         url_list = [url] if isinstance(url, str) else url
-        payload = []
-
-        for u in url_list:
-            item: Dict[str, Any] = {"url": u}
-
-            if pastDays is not None:
-                item["pastDays"] = pastDays
-            if keyWord is not None:
-                item["keyWord"] = keyWord
-            if numOfReviews is not None:
-                item["numOfReviews"] = numOfReviews
-
-            payload.append(item)
+        payload = [{"url": u} for u in url_list]

         # Use reviews dataset with standard async workflow
         is_single = isinstance(url, str)
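
What reviews_async() now sends, as a standalone sketch (the helper name below is illustrative; the logic mirrors the diff): only the URL survives into the payload.

from typing import Any, Dict, List, Union

def build_reviews_payload(url: Union[str, List[str]]) -> List[Dict[str, Any]]:
    """Stand-in for the simplified payload build: URL only, no filter keys."""
    url_list = [url] if isinstance(url, str) else url
    return [{"url": u} for u in url_list]

# build_reviews_payload("https://www.amazon.com/dp/B0CRMZHDG8")
#   -> [{'url': 'https://www.amazon.com/dp/B0CRMZHDG8'}]
# pastDays, keyWord and numOfReviews are still accepted by reviews_async()
# for compatibility, but they are no longer forwarded to the Reviews dataset.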

src/brightdata/scrapers/api_client.py

Lines changed: 2 additions & 2 deletions

@@ -65,9 +65,9 @@ async def trigger(
             "dataset_id": dataset_id,
             "include_errors": str(include_errors).lower(),
         }
-
+
         if sdk_function:
-            payload = [{**item, "sdk_function": sdk_function} for item in payload]
+            params["sdk_function"] = sdk_function

         async with self.engine.post_to_url(
             self.TRIGGER_URL,
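
Illustrative before/after of the trigger request shape (standalone sketch, not the SDK's actual request code; the sdk_function value "products" is hypothetical):

payload = [{"url": "https://www.amazon.com/dp/B0CRMZHDG8"}]

# Before: the marker was injected into every payload item
body_before = [{**item, "sdk_function": "products"} for item in payload]

# After: the payload stays untouched and the marker travels as a query parameter
params_after = {
    "dataset_id": "gd_l7q7dkf244hwjntr0",
    "include_errors": "true",
    "sdk_function": "products",
}
# i.e. POST TRIGGER_URL?dataset_id=...&include_errors=true&sdk_function=products
# with the unmodified payload as the JSON body.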

src/brightdata/scrapers/chatgpt/scraper.py

Lines changed: 7 additions & 5 deletions

@@ -81,13 +81,14 @@ async def prompt_async(
         if not prompt or not isinstance(prompt, str):
             raise ValidationError("Prompt must be a non-empty string")

-        # Build payload
+        # Build payload - ChatGPT scraper requires url field pointing to ChatGPT
         payload = [{
+            "url": "https://chatgpt.com/",
             "prompt": prompt,
             "country": country.upper(),
             "web_search": web_search,
         }]
-
+
         if additional_prompt:
             payload[0]["additional_prompt"] = additional_prompt

@@ -158,18 +159,19 @@ async def prompts_async(
         if not prompts or not isinstance(prompts, list):
             raise ValidationError("Prompts must be a non-empty list")

-        # Build batch payload
+        # Build batch payload - ChatGPT scraper requires url field
         payload = []
         for i, prompt in enumerate(prompts):
             item = {
+                "url": "https://chatgpt.com/",
                 "prompt": prompt,
                 "country": countries[i].upper() if countries and i < len(countries) else "US",
                 "web_search": web_searches[i] if web_searches and i < len(web_searches) else False,
             }
-
+
             if additional_prompts and i < len(additional_prompts):
                 item["additional_prompt"] = additional_prompts[i]
-
+
             payload.append(item)

         # Execute workflow
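
A sketch of the payload prompt_async() now builds (prompt text and country are illustrative); the key change is the mandatory "url" entry pointing at ChatGPT:

payload = [{
    "url": "https://chatgpt.com/",   # required by the ChatGPT dataset
    "prompt": "Summarize this week's AI news",
    "country": "US",
    "web_search": False,
}]
# prompts_async() builds the same shape once per prompt and adds an
# "additional_prompt" key only when one is supplied.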

src/brightdata/scrapers/instagram/search.py

Lines changed: 1 addition & 1 deletion

@@ -115,7 +115,6 @@ async def posts_async(
             end_date=end_date,
             post_type=post_type,
             timeout=timeout,
-            sdk_function="posts",
         )

     def posts(

@@ -216,6 +215,7 @@ async def _discover_with_params(
         end_date: Optional[str] = None,
         post_type: Optional[str] = None,
         timeout: int = DEFAULT_TIMEOUT_MEDIUM,
+        sdk_function: Optional[str] = None,
     ) -> Union[ScrapeResult, List[ScrapeResult]]:
         """
         Discover content with additional parameters using standard async workflow.
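
Condensed view of the signature change, as a stripped-down stand-in rather than the full method:

from typing import Optional

# Before: posts_async() hard-coded sdk_function="posts" when delegating.
# After: the delegate accepts it as an optional keyword and posts_async() omits it.
async def _discover_with_params(
    end_date: Optional[str] = None,
    post_type: Optional[str] = None,
    timeout: int = 240,                  # DEFAULT_TIMEOUT_MEDIUM in the SDK
    sdk_function: Optional[str] = None,  # newly added, defaults to None
) -> None:
    ...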

src/brightdata/scrapers/linkedin/scraper.py

Lines changed: 4 additions & 4 deletions

@@ -53,10 +53,10 @@ class LinkedInScraper(BaseWebScraper):
     """

     # LinkedIn dataset IDs
-    DATASET_ID = "gd_l1oojb10z2jye29kh"  # People Profiles
-    DATASET_ID_COMPANIES = "gd_lhkq90okie75oj8mo"  # Companies
-    DATASET_ID_JOBS = "gd_lj4v2v5oqpp3qb79j"  # Jobs
-    DATASET_ID_POSTS = "gd_lwae11111pwxp6c4ea"  # Posts
+    DATASET_ID = "gd_l1viktl72bvl7bjuj0"  # People Profiles
+    DATASET_ID_COMPANIES = "gd_l1vikfnt1wgvvqz95w"  # Companies
+    DATASET_ID_JOBS = "gd_lpfll7v5hcqtkxl6l"  # Jobs
+    DATASET_ID_POSTS = "gd_lyy3tktm25m4avu764"  # Posts

     PLATFORM_NAME = "linkedin"
     MIN_POLL_TIMEOUT = DEFAULT_TIMEOUT_SHORT
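
For quick reference, the refreshed LinkedIn dataset IDs in one place (assembled from this diff, not pulled from the SDK):

LINKEDIN_DATASET_IDS = {
    "people_profiles": "gd_l1viktl72bvl7bjuj0",
    "companies": "gd_l1vikfnt1wgvvqz95w",
    "jobs": "gd_lpfll7v5hcqtkxl6l",
    "posts": "gd_lyy3tktm25m4avu764",
}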

src/brightdata/scrapers/linkedin/search.py

Lines changed: 9 additions & 5 deletions

@@ -39,9 +39,10 @@ class LinkedInSearchScraper:
     """

     # Dataset IDs for different LinkedIn types
-    DATASET_ID_POSTS = "gd_lwae11111pwxp6c4ea"
-    DATASET_ID_PROFILES = "gd_l1oojb10z2jye29kh"
-    DATASET_ID_JOBS = "gd_lj4v2v5oqpp3qb79j"
+    DATASET_ID_POSTS = "gd_lyy3tktm25m4avu764"
+    DATASET_ID_PROFILES = "gd_l1viktl72bvl7bjuj0"
+    DATASET_ID_JOBS = "gd_lpfll7v5hcqtkxl6l"  # URL-based job scraping
+    DATASET_ID_JOBS_DISCOVERY = "gd_m487ihp32jtc4ujg45"  # Keyword/location discovery

     def __init__(self, bearer_token: str, engine: Optional[AsyncEngine] = None):
         """

@@ -288,10 +289,13 @@ async def jobs_async(
                 item["locationRadius"] = location_radii[i]

             payload.append(item)
-
+
+        # Use discovery dataset if searching by keyword/location, otherwise URL-based
+        dataset_id = self.DATASET_ID_JOBS_DISCOVERY if (keyword or location) else self.DATASET_ID_JOBS
+
         return await self._execute_search(
             payload=payload,
-            dataset_id=self.DATASET_ID_JOBS,
+            dataset_id=dataset_id,
             timeout=timeout
         )
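
The routing logic added to jobs_async(), isolated as a runnable sketch (the function name is illustrative; the constants and the condition come from the diff):

DATASET_ID_JOBS = "gd_lpfll7v5hcqtkxl6l"             # URL-based job scraping
DATASET_ID_JOBS_DISCOVERY = "gd_m487ihp32jtc4ujg45"  # keyword/location discovery

def pick_jobs_dataset(keyword=None, location=None, url=None):
    """Keyword or location searches route to the discovery dataset."""
    return DATASET_ID_JOBS_DISCOVERY if (keyword or location) else DATASET_ID_JOBS

assert pick_jobs_dataset(keyword="python developer") == DATASET_ID_JOBS_DISCOVERY
assert pick_jobs_dataset(url="https://www.linkedin.com/jobs/view/123") == DATASET_ID_JOBS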

tests/enes/amazon.py

Lines changed: 66 additions & 58 deletions

@@ -24,39 +24,43 @@ async def test_amazon_products():
     client = BrightDataClient()

     async with client.engine:
-        print("\n🛒 Testing Amazon product scraping...")
-        print("📍 Product URL: https://www.amazon.com/dp/B0CRMZHDG8")
-
-        try:
-            result = await client.scrape.amazon.products_async(
-                url="https://www.amazon.com/dp/B0CRMZHDG8",
-                timeout=240
-            )
-
-            print(f"\n✅ API call succeeded")
-            print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "")
-
-            print(f"\n📊 Result analysis:")
-            print(f"   - result.success: {result.success}")
-            print(f"   - result.data type: {type(result.data)}")
-
-            if result.data:
-                print(f"\n✅ Got product data:")
-                if isinstance(result.data, dict):
-                    print(f"   - Title: {result.data.get('title', 'N/A')}")
-                    print(f"   - Price: {result.data.get('price', 'N/A')}")
-                    print(f"   - ASIN: {result.data.get('asin', 'N/A')}")
-                    print(f"   - Rating: {result.data.get('rating', 'N/A')}")
-                    print(f"   - Review Count: {result.data.get('reviews_count', 'N/A')}")
-                else:
-                    print(f"   Data: {result.data}")
-            else:
-                print(f"\n❌ No product data returned")
-
-        except Exception as e:
-            print(f"\n❌ Error: {e}")
-            import traceback
-            traceback.print_exc()
+        scraper = client.scrape.amazon
+        async with scraper.engine:
+            print("\n🛒 Testing Amazon product scraping...")
+            print("📍 Product URL: https://www.amazon.com/dp/B0CRMZHDG8")
+
+            try:
+                result = await scraper.products_async(
+                    url="https://www.amazon.com/dp/B0CRMZHDG8",
+                    timeout=240
+                )
+
+                print(f"\n✅ API call succeeded")
+                print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "")
+
+                print(f"\n📊 Result analysis:")
+                print(f"   - result.success: {result.success}")
+                print(f"   - result.data type: {type(result.data)}")
+                print(f"   - result.status: {result.status if hasattr(result, 'status') else 'N/A'}")
+                print(f"   - result.error: {result.error if hasattr(result, 'error') else 'N/A'}")
+
+                if result.data:
+                    print(f"\n✅ Got product data:")
+                    if isinstance(result.data, dict):
+                        print(f"   - Title: {result.data.get('title', 'N/A')}")
+                        print(f"   - Price: {result.data.get('price', 'N/A')}")
+                        print(f"   - ASIN: {result.data.get('asin', 'N/A')}")
+                        print(f"   - Rating: {result.data.get('rating', 'N/A')}")
+                        print(f"   - Review Count: {result.data.get('reviews_count', 'N/A')}")
+                    else:
+                        print(f"   Data: {result.data}")
+                else:
+                    print(f"\n❌ No product data returned")
+
+            except Exception as e:
+                print(f"\n❌ Error: {e}")
+                import traceback
+                traceback.print_exc()


 async def test_amazon_reviews():

@@ -69,45 +73,49 @@ async def test_amazon_reviews():
     client = BrightDataClient()

     async with client.engine:
-        print("\n📝 Testing Amazon reviews scraping...")
-        print("📍 Product URL: https://www.amazon.com/dp/B0CRMZHDG8")
-        print("📋 Parameters: pastDays=30, numOfReviews=10")
-
-        try:
-            result = await client.scrape.amazon.reviews_async(
-                url="https://www.amazon.com/dp/B0CRMZHDG8",
-                pastDays=30,
-                numOfReviews=10,
-                timeout=240
-            )
-
-            print(f"\n✅ API call succeeded")
-            print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "")
-
-            print(f"\n📊 Result analysis:")
-            print(f"   - result.success: {result.success}")
-            print(f"   - result.data type: {type(result.data)}")
-
-            if result.data:
-                if isinstance(result.data, list):
-                    print(f"\n✅ Got {len(result.data)} reviews:")
-                    for i, review in enumerate(result.data[:3], 1):
-                        print(f"\n   Review {i}:")
-                        print(f"   - Rating: {review.get('rating', 'N/A')}")
-                        print(f"   - Title: {review.get('title', 'N/A')[:60]}...")
-                        print(f"   - Author: {review.get('author', 'N/A')}")
-                elif isinstance(result.data, dict):
-                    reviews = result.data.get('reviews', [])
-                    print(f"\n✅ Got {len(reviews)} reviews")
-                else:
-                    print(f"   Data: {result.data}")
-            else:
-                print(f"\n❌ No reviews data returned")
-
-        except Exception as e:
-            print(f"\n❌ Error: {e}")
-            import traceback
-            traceback.print_exc()
+        scraper = client.scrape.amazon
+        async with scraper.engine:
+            print("\n📝 Testing Amazon reviews scraping...")
+            print("📍 Product URL: https://www.amazon.com/dp/B0CRMZHDG8")
+            print("📋 Parameters: pastDays=30, numOfReviews=10")
+
+            try:
+                result = await scraper.reviews_async(
+                    url="https://www.amazon.com/dp/B0CRMZHDG8",
+                    pastDays=30,
+                    numOfReviews=10,
+                    timeout=240
+                )
+
+                print(f"\n✅ API call succeeded")
+                print(f"⏱️ Elapsed: {result.elapsed_ms():.2f}ms" if result.elapsed_ms() else "")
+
+                print(f"\n📊 Result analysis:")
+                print(f"   - result.success: {result.success}")
+                print(f"   - result.data type: {type(result.data)}")
+                print(f"   - result.status: {result.status if hasattr(result, 'status') else 'N/A'}")
+                print(f"   - result.error: {result.error if hasattr(result, 'error') else 'N/A'}")
+
+                if result.data:
+                    if isinstance(result.data, list):
+                        print(f"\n✅ Got {len(result.data)} reviews:")
+                        for i, review in enumerate(result.data[:3], 1):
+                            print(f"\n   Review {i}:")
+                            print(f"   - Rating: {review.get('rating', 'N/A')}")
+                            print(f"   - Title: {review.get('title', 'N/A')[:60]}...")
+                            print(f"   - Author: {review.get('author', 'N/A')}")
+                    elif isinstance(result.data, dict):
+                        reviews = result.data.get('reviews', [])
+                        print(f"\n✅ Got {len(reviews)} reviews")
+                    else:
+                        print(f"   Data: {result.data}")
+                else:
+                    print(f"\n❌ No reviews data returned")
+
+            except Exception as e:
+                print(f"\n❌ Error: {e}")
+                import traceback
+                traceback.print_exc()


 if __name__ == "__main__":
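
The access pattern the updated tests rely on, as a minimal standalone sketch (import path assumed): the scraper is bound once and its own engine context is entered inside the client engine context.

import asyncio
from brightdata import BrightDataClient  # assumed import path

async def main():
    client = BrightDataClient()
    async with client.engine:
        scraper = client.scrape.amazon
        async with scraper.engine:  # nested engine context used by the tests
            result = await scraper.products_async(
                url="https://www.amazon.com/dp/B0CRMZHDG8",
                timeout=240,
            )
            print(result.success, type(result.data))

asyncio.run(main())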
