update similarweb sitemap scraper and add missing test case

mazen-r · mazen-r · commit f6f4ad8e4c91 · 2025-09-25T00:55:37.000+03:00
diff --git a/.github/workflows/test_scrapers.yaml b/.github/workflows/test_scrapers.yaml
@@ -92,6 +92,8 @@ jobs:
             test: test_website_compare_scraping
           - project_dir: similarweb-scraper
             test: test_trend_scraping
+          - project_dir: similarweb-scraper
+            test: test_sitemap_scraping
           - project_dir: zillow-scraper
             test: test_search_scraping
           - project_dir: zillow-scraper
diff --git a/similarweb-scraper/run.py b/similarweb-scraper/run.py
@@ -34,7 +34,7 @@ async def run():
         json.dump(comparing_data, file, indent=2, ensure_ascii=False)    
 
     sitemap_data = await similarweb.scrape_sitemaps(
-        url="https://www.similarweb.com/sitemaps/top-websites-trending/part-00000.gz"
+        url="https://www.similarweb.com/sitemaps/top-websites/top-websites-001.xml.gz"
     )
     with open(output.joinpath("sitemap_urls.json"), "w", encoding="utf-8") as file:
         json.dump(sitemap_data, file, indent=2, ensure_ascii=False)
diff --git a/similarweb-scraper/similarweb.py b/similarweb-scraper/similarweb.py
@@ -10,6 +10,7 @@
 import gzip
 import json
 import jmespath
+import base64
 from parsel import Selector
 from typing import Dict, List, Optional
 from loguru import logger as log
@@ -85,9 +86,10 @@ async def scrape_website_compare(first_domain: str, second_domain: str) -> Dict:
 
 def parse_sitemaps(response: ScrapeApiResponse) -> List[str]:
     """parse links for bestbuy sitemap"""
-    bytes_data = response.scrape_result['content'].getvalue()
-    # decode the .gz file
-    xml = bytes_data.decode('utf-8')
+    content = response.scrape_result['content']
+    # decode base64 content first
+    decoded_bytes = base64.b64decode(content)
+    xml = decoded_bytes.decode('utf-8')
     selector = Selector(xml)
     data = []
     for url in selector.xpath("//url/loc/text()"):
@@ -99,11 +101,11 @@ async def scrape_sitemaps(url: str) -> List[str]:
     """scrape link data from bestbuy sitemap"""
     promo_urls = None
     try:
-        response = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
+        response = await SCRAPFLY.async_scrape(ScrapeConfig(url, proxy_pool="public_residential_pool", country="us" ))
         promo_urls = parse_sitemaps(response)
         log.success(f"scraped {len(promo_urls)} urls from sitemaps")
-    except:
-        log.info("couldnt' scrape sitemaps, request was blocked")
+    except Exception as e:
+        log.info(f"couldnt' scrape sitemaps, request was blocked: {e}")
         pass
     return promo_urls
 
diff --git a/similarweb-scraper/test.py b/similarweb-scraper/test.py
@@ -141,3 +141,12 @@ async def test_trend_scraping():
     for item in trending_data:
         validate_or_fail(item, validator)
     assert len(trending_data) == 3
+
+
+@pytest.mark.asyncio
+@pytest.mark.flaky(reruns=3, reruns_delay=30)
+async def test_sitemap_scraping():
+    sitemap_urls = await similarweb.scrape_sitemaps(
+        url="https://www.similarweb.com/sitemaps/top-websites/top-websites-001.xml.gz"
+    )
+    assert len(sitemap_urls) > 100

Original file line number	Diff line number	Diff line change
`@@ -34,7 +34,7 @@ async def run():`
`34`	`34`	`json.dump(comparing_data, file, indent=2, ensure_ascii=False)`
`35`	`35`
`36`	`36`	`sitemap_data = await similarweb.scrape_sitemaps(`
`37`		`- url="https://www.similarweb.com/sitemaps/top-websites-trending/part-00000.gz"`
	`37`	`+ url="https://www.similarweb.com/sitemaps/top-websites/top-websites-001.xml.gz"`
`38`	`38`	`)`
`39`	`39`	`with open(output.joinpath("sitemap_urls.json"), "w", encoding="utf-8") as file:`
`40`	`40`	`json.dump(sitemap_data, file, indent=2, ensure_ascii=False)`