Skip to content

Commit 3ba9795

Browse files
authored
Merge pull request #55 from scrapfly/fix-scrapers-22
Fix: etsy, immowelt
2 parents dfcc52b + 2c3c90d commit 3ba9795

File tree

6 files changed

+4418
-2779
lines changed

6 files changed

+4418
-2779
lines changed

etsy-scraper/etsy.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -82,6 +82,9 @@ def parse_product_page(response: ScrapeApiResponse) -> Dict:
82 82
"""parse hidden product data from product pages"""
83 83
selector = response.selector
84 84
script = selector.xpath("//script[contains(text(),'offers')]/text()").get()
85+
if not script:
86+
log.warning(f"Could not find product data script on {response.context['url']}")
87+
return {}
85 88
data = json.loads(script)
86 89
return data
87 90

@@ -90,9 +93,15 @@ def parse_shop_page(response: ScrapeApiResponse) -> Dict:
90 93
"""parse hidden shop data from shop pages"""
91 94
selector = response.selector
92 95
script = selector.xpath("//script[contains(text(),'itemListElement')]/text()").get()
93-
data = json.loads(script)
94-
return data
95-
96+
if not script:
97+
log.warning(f"Could not find shop data script on {response.context['url']}")
98+
return {}
99+
try:
100+
data = json.loads(script)
101+
return data
102+
except json.JSONDecodeError as e:
103+
log.error(f"Failed to parse JSON from shop page {response.context['url']}: {e}")
104+
return {}
96 105

97 106
async def scrape_search(url: str, max_pages: int = None) -> List[Dict]:
98 107
"""scrape product listing data from Etsy search pages"""
@@ -159,7 +168,7 @@ async def scrape_shop(urls: List[str]) -> List[Dict]:
159 168
# scrape all the shop pages concurrently
160 169
async for response in SCRAPFLY.concurrent_scrape(to_scrape):
161 170
data = parse_shop_page(response)
162-
data['url'] = response.context['url']
171+
data["url"] = response.context["url"]
163 172
shops.append(data)
164 173
log.success(f"scraped {len(shops)} shops from shop pages")
165 174
return shops

0 commit comments

Comments (0)