Skip to content

Commit f6f4ad8

Browse files
committed
update similarweb sitemap scraper and add missing test case
1 parent 888ba7e commit f6f4ad8

File tree

4 files changed: 20 additions (+20) and 7 deletions (-7)

4 files changed

+20
-7
lines changed

.github/workflows/test_scrapers.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,8 @@ jobs:
9292
test: test_website_compare_scraping
9393
- project_dir: similarweb-scraper
9494
test: test_trend_scraping
95+
- project_dir: similarweb-scraper
96+
test: test_sitemap_scraping
9597
- project_dir: zillow-scraper
9698
test: test_search_scraping
9799
- project_dir: zillow-scraper

similarweb-scraper/run.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ async def run():
3434
json.dump(comparing_data, file, indent=2, ensure_ascii=False)
3535

3636
sitemap_data = await similarweb.scrape_sitemaps(
37-
url="https://www.similarweb.com/sitemaps/top-websites-trending/part-00000.gz"
37+
url="https://www.similarweb.com/sitemaps/top-websites/top-websites-001.xml.gz"
3838
)
3939
with open(output.joinpath("sitemap_urls.json"), "w", encoding="utf-8") as file:
4040
json.dump(sitemap_data, file, indent=2, ensure_ascii=False)

similarweb-scraper/similarweb.py

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
import gzip
1111
import json
1212
import jmespath
13+
import base64
1314
from parsel import Selector
1415
from typing import Dict, List, Optional
1516
from loguru import logger as log
@@ -85,9 +86,10 @@ async def scrape_website_compare(first_domain: str, second_domain: str) -> Dict:
8586

8687
def parse_sitemaps(response: ScrapeApiResponse) -> List[str]:
8788
"""parse links for bestbuy sitemap"""
88-
bytes_data = response.scrape_result['content'].getvalue()
89-
# decode the .gz file
90-
xml = bytes_data.decode('utf-8')
89+
content = response.scrape_result['content']
90+
# decode base64 content first
91+
decoded_bytes = base64.b64decode(content)
92+
xml = decoded_bytes.decode('utf-8')
9193
selector = Selector(xml)
9294
data = []
9395
for url in selector.xpath("//url/loc/text()"):
@@ -99,11 +101,11 @@ async def scrape_sitemaps(url: str) -> List[str]:
99101
"""scrape link data from bestbuy sitemap"""
100102
promo_urls = None
101103
try:
102-
response = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
104+
response = await SCRAPFLY.async_scrape(ScrapeConfig(url, proxy_pool="public_residential_pool", country="us" ))
103105
promo_urls = parse_sitemaps(response)
104106
log.success(f"scraped {len(promo_urls)} urls from sitemaps")
105-
except:
106-
log.info("couldnt' scrape sitemaps, request was blocked")
107+
except Exception as e:
108+
log.info(f"couldnt' scrape sitemaps, request was blocked: {e}")
107109
pass
108110
return promo_urls
109111

similarweb-scraper/test.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -141,3 +141,12 @@ async def test_trend_scraping():
141141
for item in trending_data:
142142
validate_or_fail(item, validator)
143143
assert len(trending_data) == 3
144+
145+
146+
@pytest.mark.asyncio
147+
@pytest.mark.flaky(reruns=3, reruns_delay=30)
148+
async def test_sitemap_scraping():
149+
sitemap_urls = await similarweb.scrape_sitemaps(
150+
url="https://www.similarweb.com/sitemaps/top-websites/top-websites-001.xml.gz"
151+
)
152+
assert len(sitemap_urls) > 100

0 commit comments

Comments
 (0)