
Commit 619dc8f

Merge pull request #81 from scrapfly/fix-g2
Update g2 config
2 parents ea12f79 + e23928e commit 619dc8f

6 files changed: +78 -339 lines changed

g2-scraper/g2.py

Lines changed: 2 additions & 58 deletions
@@ -20,6 +20,8 @@
     "asp": True,
     # set the proxy location to US
     "country": "US",
+    "render_js": True,
+    "proxy_pool": "public_residential_pool"
 }
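
With these two lines, every request built from BASE_CONFIG renders JavaScript in a headless browser and routes through ScrapFly's residential proxy pool by default, rather than only on a retry. A minimal sketch of the resulting setup; the API key and search URL are placeholders, not part of this commit:

import asyncio
from scrapfly import ScrapeConfig, ScrapflyClient

SCRAPFLY = ScrapflyClient(key="YOUR-SCRAPFLY-KEY")  # placeholder key

BASE_CONFIG = {
    "asp": True,  # bypass G2's anti-scraping protection
    "country": "US",  # set the proxy location to US
    "render_js": True,  # new default: headless-browser rendering
    "proxy_pool": "public_residential_pool",  # new default: residential IPs
}

async def main():
    # every ScrapeConfig now inherits the stronger defaults
    response = await SCRAPFLY.async_scrape(
        ScrapeConfig("https://www.g2.com/search?query=aws", **BASE_CONFIG)
    )
    print(response.scrape_result["content"][:200])

asyncio.run(main())

The trade-off is cost: headless rendering and residential proxies now apply to every request rather than only to blocked ones, which is what allows the retry blocks below to be deleted.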

@@ -104,28 +106,6 @@ async def scrape_search(url: str, max_scrape_pages: int = None) -> List[Dict]:
             log.error(f"Error encountered: {e}")
             continue
 
-    # try again with the blocked requests if any using headless browsers and residential proxies
-    if len(remaining_urls) != 0:
-        log.debug(
-            f"{len(remaining_urls)} requests are blocked, trying again with render_js enabled and residential proxies"
-        )
-        try:
-            failed_requests = [
-                ScrapeConfig(
-                    url,
-                    **BASE_CONFIG,
-                    render_js=True,
-                    proxy_pool="public_residential_pool",
-                )
-                for url in remaining_urls
-            ]
-            async for response in SCRAPFLY.concurrent_scrape(failed_requests):
-                data = parse_search_page(response)
-                search_data.extend(data["search_data"])
-        except Exception as e:  # catching any exception
-            log.error(f"Error encountered: {e}")
-            pass
-    log.success(f"scraped {len(search_data)} company listings from G2 search pages with the URL {url}")
     return search_data
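
A blocked request would now be retried with exactly the configuration it already failed with, so the fallback pass is dead weight and the function reduces to a single concurrent pass. A sketch of the remaining flow, assuming parse_search_page, SCRAPFLY, and log as defined elsewhere in this file (the function name is illustrative):

async def scrape_search_pages(urls: list) -> list:
    # single pass: BASE_CONFIG already carries render_js and the residential proxy pool
    search_data = []
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        try:
            data = parse_search_page(response)
            search_data.extend(data["search_data"])
        except Exception as e:
            log.error(f"Error encountered: {e}")
            continue
    return search_data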

@@ -209,7 +189,6 @@ async def scrape_reviews(url: str, max_review_pages: int = None) -> List[Dict]:
         **BASE_CONFIG,
         "debug": True,
         "auto_scroll": True,
-        "render_js": True,
         "wait_for_selector": "//section[@id='reviews']//article",
     }
     first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **enhanced_config))
@@ -234,26 +213,6 @@ async def scrape_reviews(url: str, max_review_pages: int = None) -> List[Dict]:
             log.error(f"Error encountered: {e}")
             continue
 
-    if len(remaining_urls) != 0:
-        log.debug(
-            f"{len(remaining_urls)} requests are blocked, trying again with render_js enabled and residential proxies"
-        )
-        try:
-            failed_requests = [
-                ScrapeConfig(
-                    url,
-                    **BASE_CONFIG,
-                    render_js=True,
-                    proxy_pool="public_residential_pool",
-                )
-                for url in remaining_urls
-            ]
-            async for response in SCRAPFLY.concurrent_scrape(failed_requests):
-                data = parse_search_page(response)
-                reviews_data.extend(data["reviews_data"])
-        except Exception as e:  # catch any exception
-            log.error(f"Error encountered: {e}")
-            pass
     log.success(f"scraped {len(reviews_data)} company reviews from G2 review pages with the URL {url}")
     return reviews_data
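
The same fallback is removed from scrape_reviews. Note that the deleted block parsed review responses with parse_search_page and then read a "reviews_data" key from the result, so it likely never worked when triggered. With render_js inherited from BASE_CONFIG, only the review-specific options remain local to enhanced_config:

# enhanced_config after this commit: BASE_CONFIG supplies render_js and the
# proxy pool; only the review-page extras are layered on top
enhanced_config = {
    **BASE_CONFIG,
    "debug": True,
    "auto_scroll": True,  # scroll so lazy-loaded reviews are rendered
    "wait_for_selector": "//section[@id='reviews']//article",  # wait for review cards
}
config = ScrapeConfig(url, **enhanced_config)  # url: a G2 product reviews page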

@@ -330,25 +289,10 @@ async def scrape_alternatives(
     """scrape product alternatives from G2 alternative pages"""
     # the default alternative is top 10, which takes no argument
     url = f"https://www.g2.com/products/{product}/competitors/alternatives/{alternatives}"
-    log.info(f"Scraping alternative page {url} (attempt 1: no JS)")
-
     data = []
     try:
-        # 1. First, try the cheap and fast request without JavaScript
         response = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
         data = parse_alternatives(response)
-
-        # 2. Check if the first attempt failed to get descriptions
-        # This checks if we got data, but the description field in all items is empty
-        descriptions_missing = data and not any(item.get("description") for item in data)
-
-        if descriptions_missing:
-            log.warning("Descriptions missing. Retrying with JavaScript rendering (attempt 2).")
-            # 3. If descriptions are missing, retry with render_js=True
-            js_config = ScrapeConfig(url, **BASE_CONFIG, render_js=True)
-            response = await SCRAPFLY.async_scrape(js_config)
-            data = parse_alternatives(response)
-
     except Exception as e:
         log.error(f"An exception occurred during scraping: {e}")
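
scrape_alternatives loses its two-step escalation (a cheap no-JS attempt, then a render_js retry when every description came back empty) because the first attempt now renders JavaScript. A hypothetical invocation of the simplified function; "digitalocean" is an example product slug, not taken from this commit:

import asyncio

# one attempt, with JS rendering on by default via BASE_CONFIG
alternatives = asyncio.run(scrape_alternatives(product="digitalocean"))
print(f"got {len(alternatives)} alternatives")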

g2-scraper/pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ readme = "README.md"
 python = "^3.10"
 scrapfly-sdk = {extras = ["all"], version = "^0.8.5"}
 loguru = "^0.7.1"
+pytest-rerunfailures = "^14.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^23.7.0"
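
pytest-rerunfailures gives the test suite automatic retries for flaky, network-bound scrapes. A sketch of how it is typically wired in; the test name and body are illustrative:

import pytest

@pytest.mark.flaky(reruns=3, reruns_delay=2)  # retry up to 3 times, 2 s apart
def test_scrape_search_returns_results():
    ...  # illustrative: run the scraper and assert on the parsed results

The same behavior can be applied suite-wide from the command line with pytest --reruns 3.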

g2-scraper/results/alternatives.json

Lines changed: 6 additions & 6 deletions
@@ -35,47 +35,47 @@
     "name": "Amazon EC2",
     "link": "https://www.g2.com/products/amazon-ec2/reviews",
     "ranking": 5,
-    "numberOfReviews": 1333,
+    "numberOfReviews": 1245,
     "rate": 4.6,
     "description": "AWS Elastic Compute Cloud (EC2) is a web service that provides resizable compute capacity in the cloud, making web-scale computing easier for developers."
   },
   {
     "name": "AWS Lambda",
     "link": "https://www.g2.com/products/aws-lambda/reviews",
     "ranking": 6,
-    "numberOfReviews": 1124,
+    "numberOfReviews": 1020,
     "rate": 4.6,
     "description": "Run code without thinking about servers. Pay for only the compute time you consume."
   },
   {
     "name": "Amazon Relational Database Service (RDS)",
     "link": "https://www.g2.com/products/amazon-relational-database-service-rds/reviews",
     "ranking": 7,
-    "numberOfReviews": 1060,
+    "numberOfReviews": 966,
     "rate": 4.5,
     "description": "Amazon Relational Database Service (RDS) is a web service that makes it easy to set up, operate, and scale a relational DB in the cloud: Amazon Aurora, PostgreSQL, MySQL, MariaDB, Oracle, and Microsoft SQL Server."
   },
   {
     "name": "Google Compute Engine",
     "link": "https://www.g2.com/products/google-compute-engine/reviews",
     "ranking": 8,
-    "numberOfReviews": 981,
+    "numberOfReviews": 946,
     "rate": 4.5,
     "description": "Compute Engine enables you to create and run large-scale workloads on virtual machines hosted on Google Cloud. Get running quickly with pre-built and ready-to-go configurations or create machines of your own with the optimal amount of vCPU and memory required for your workload."
   },
   {
     "name": "Hostinger",
     "link": "https://www.g2.com/products/hostinger/reviews",
     "ranking": 9,
-    "numberOfReviews": 751,
+    "numberOfReviews": 755,
     "rate": 4.4,
     "description": "Hostinger provides every customer with all the necessary tools to have a fully-functional website up and running as quickly as possible. Hostinger provides an incredibly convenient drag & drop website builder and application installer."
   },
   {
     "name": "Azure Virtual Machines",
     "link": "https://www.g2.com/products/azure-virtual-machines/reviews",
     "ranking": 10,
-    "numberOfReviews": 474,
+    "numberOfReviews": 420,
     "rate": 4.4,
     "description": "Azure Virtual Machines gives you the flexibility of virtualization for a wide range of computing solutions: development and testing, running applications, and extending your datacenter with support for Linux, Windows Server, SQL Server, Oracle, IBM, and SAP."
   }
