
Commit 619dc8f

Merge pull request #81 from scrapfly/fix-g2
Update g2 config
2 parents ea12f79 + e23928e commit 619dc8f

6 files changed: +78 -339 lines changed

g2-scraper/g2.py

Lines changed: 2 additions & 58 deletions
@@ -20,6 +20,8 @@
     "asp": True,
     # set the proxy location to US
     "country": "US",
+    "render_js": True,
+    "proxy_pool": "public_residential_pool"
 }
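
With these two lines, every request built from BASE_CONFIG renders JavaScript in a headless browser and routes through ScrapFly's residential proxy pool by default, rather than only on a retry. A minimal sketch of the resulting setup; the API key and search URL are placeholders, not part of this commit:

import asyncio
from scrapfly import ScrapeConfig, ScrapflyClient

SCRAPFLY = ScrapflyClient(key="YOUR-SCRAPFLY-KEY")  # placeholder key

BASE_CONFIG = {
    "asp": True,  # bypass G2's anti-scraping protection
    "country": "US",  # set the proxy location to US
    "render_js": True,  # new default: headless-browser rendering
    "proxy_pool": "public_residential_pool",  # new default: residential IPs
}

async def main():
    # every ScrapeConfig now inherits the stronger defaults
    response = await SCRAPFLY.async_scrape(
        ScrapeConfig("https://www.g2.com/search?query=aws", **BASE_CONFIG)
    )
    print(response.scrape_result["content"][:200])

asyncio.run(main())

The trade-off is cost: headless rendering and residential proxies now apply to every request rather than only to blocked ones, which is what allows the retry blocks below to be deleted.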

@@ -104,28 +106,6 @@ async def scrape_search(url: str, max_scrape_pages: int = None) -> List[Dict]:
             log.error(f"Error encountered: {e}")
             continue
 
-    # try again with the blocked requests if any using headless browsers and residential proxies
-    if len(remaining_urls) != 0:
-        log.debug(
-            f"{len(remaining_urls)} requests are blocked, trying again with render_js enabled and residential proxies"
-        )
-        try:
-            failed_requests = [
-                ScrapeConfig(
-                    url,
-                    **BASE_CONFIG,
-                    render_js=True,
-                    proxy_pool="public_residential_pool",
-                )
-                for url in remaining_urls
-            ]
-            async for response in SCRAPFLY.concurrent_scrape(failed_requests):
-                data = parse_search_page(response)
-                search_data.extend(data["search_data"])
-        except Exception as e:  # catching any exception
-            log.error(f"Error encountered: {e}")
-            pass
-    log.success(f"scraped {len(search_data)} company listings from G2 search pages with the URL {url}")
     return search_data
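
A blocked request would now be retried with exactly the configuration it already failed with, so the fallback pass is dead weight and the function reduces to a single concurrent pass. A sketch of the remaining flow, assuming parse_search_page, SCRAPFLY, and log as defined elsewhere in this file (the function name is illustrative):

async def scrape_search_pages(urls: list) -> list:
    # single pass: BASE_CONFIG already carries render_js and the residential proxy pool
    search_data = []
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        try:
            data = parse_search_page(response)
            search_data.extend(data["search_data"])
        except Exception as e:
            log.error(f"Error encountered: {e}")
            continue
    return search_data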

@@ -209,7 +189,6 @@ async def scrape_reviews(url: str, max_review_pages: int = None) -> List[Dict]:
         **BASE_CONFIG,
         "debug": True,
         "auto_scroll": True,
-        "render_js": True,
         "wait_for_selector": "//section[@id='reviews']//article",
     }
     first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **enhanced_config))
@@ -234,26 +213,6 @@ async def scrape_reviews(url: str, max_review_pages: int = None) -> List[Dict]:
             log.error(f"Error encountered: {e}")
             continue
 
-    if len(remaining_urls) != 0:
-        log.debug(
-            f"{len(remaining_urls)} requests are blocked, trying again with render_js enabled and residential proxies"
-        )
-        try:
-            failed_requests = [
-                ScrapeConfig(
-                    url,
-                    **BASE_CONFIG,
-                    render_js=True,
-                    proxy_pool="public_residential_pool",
-                )
-                for url in remaining_urls
-            ]
-            async for response in SCRAPFLY.concurrent_scrape(failed_requests):
-                data = parse_search_page(response)
-                reviews_data.extend(data["reviews_data"])
-        except Exception as e:  # catch any exception
-            log.error(f"Error encountered: {e}")
-            pass
     log.success(f"scraped {len(reviews_data)} company reviews from G2 review pages with the URL {url}")
     return reviews_data
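
The same fallback is removed from scrape_reviews. Note that the deleted block parsed review responses with parse_search_page and then read a "reviews_data" key from the result, so it likely never worked when triggered. With render_js inherited from BASE_CONFIG, only the review-specific options remain local to enhanced_config:

# enhanced_config after this commit: BASE_CONFIG supplies render_js and the
# proxy pool; only the review-page extras are layered on top
enhanced_config = {
    **BASE_CONFIG,
    "debug": True,
    "auto_scroll": True,  # scroll so lazy-loaded reviews are rendered
    "wait_for_selector": "//section[@id='reviews']//article",  # wait for review cards
}
config = ScrapeConfig(url, **enhanced_config)  # url: a G2 product reviews page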

@@ -330,25 +289,10 @@ async def scrape_alternatives(
     """scrape product alternatives from G2 alternative pages"""
     # the default alternative is top 10, which takes no argument
     url = f"https://www.g2.com/products/{product}/competitors/alternatives/{alternatives}"
-    log.info(f"Scraping alternative page {url} (attempt 1: no JS)")
-
     data = []
     try:
-        # 1. First, try the cheap and fast request without JavaScript
         response = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
         data = parse_alternatives(response)
-
-        # 2. Check if the first attempt failed to get descriptions
-        # This checks if we got data, but the description field in all items is empty
-        descriptions_missing = data and not any(item.get("description") for item in data)
-
-        if descriptions_missing:
-            log.warning("Descriptions missing. Retrying with JavaScript rendering (attempt 2).")
-            # 3. If descriptions are missing, retry with render_js=True
-            js_config = ScrapeConfig(url, **BASE_CONFIG, render_js=True)
-            response = await SCRAPFLY.async_scrape(js_config)
-            data = parse_alternatives(response)
-
     except Exception as e:
         log.error(f"An exception occurred during scraping: {e}")
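
scrape_alternatives loses its two-step escalation (a cheap no-JS attempt, then a render_js retry when every description came back empty) because the first attempt now renders JavaScript. A hypothetical invocation of the simplified function; "digitalocean" is an example product slug, not taken from this commit:

import asyncio

# one attempt, with JS rendering on by default via BASE_CONFIG
alternatives = asyncio.run(scrape_alternatives(product="digitalocean"))
print(f"got {len(alternatives)} alternatives")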

g2-scraper/pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ readme = "README.md"
 python = "^3.10"
 scrapfly-sdk = {extras = ["all"], version = "^0.8.5"}
 loguru = "^0.7.1"
+pytest-rerunfailures = "^14.0"
 
 [tool.poetry.group.dev.dependencies]
 black = "^23.7.0"
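
pytest-rerunfailures gives the test suite automatic retries for flaky, network-bound scrapes. A sketch of how it is typically wired in; the test name and body are illustrative:

import pytest

@pytest.mark.flaky(reruns=3, reruns_delay=2)  # retry up to 3 times, 2 s apart
def test_scrape_search_returns_results():
    ...  # illustrative: run the scraper and assert on the parsed results

The same behavior can be applied suite-wide from the command line with pytest --reruns 3.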

g2-scraper/results/alternatives.json

Lines changed: 6 additions & 6 deletions
@@ -35,47 +35,47 @@
     "name": "Amazon EC2",
     "link": "https://www.g2.com/products/amazon-ec2/reviews",
     "ranking": 5,
-    "numberOfReviews": 1333,
+    "numberOfReviews": 1245,
     "rate": 4.6,
     "description": "AWS Elastic Compute Cloud (EC2) is a web service that provides resizable compute capacity in the cloud, making web-scale computing easier for developers."
   },
   {
     "name": "AWS Lambda",
     "link": "https://www.g2.com/products/aws-lambda/reviews",
     "ranking": 6,
-    "numberOfReviews": 1124,
+    "numberOfReviews": 1020,
     "rate": 4.6,
     "description": "Run code without thinking about servers. Pay for only the compute time you consume."
   },
   {
     "name": "Amazon Relational Database Service (RDS)",
     "link": "https://www.g2.com/products/amazon-relational-database-service-rds/reviews",
     "ranking": 7,
-    "numberOfReviews": 1060,
+    "numberOfReviews": 966,
     "rate": 4.5,
     "description": "Amazon Relational Database Service (RDS) is a web service that makes it easy to set up, operate, and scale a relational DB in the cloud: Amazon Aurora, PostgreSQL, MySQL, MariaDB, Oracle, and Microsoft SQL Server."
   },
   {
     "name": "Google Compute Engine",
     "link": "https://www.g2.com/products/google-compute-engine/reviews",
     "ranking": 8,
-    "numberOfReviews": 981,
+    "numberOfReviews": 946,
     "rate": 4.5,
     "description": "Compute Engine enables you to create and run large-scale workloads on virtual machines hosted on Google Cloud. Get running quickly with pre-built and ready-to-go configurations or create machines of your own with the optimal amount of vCPU and memory required for your workload."
   },
   {
     "name": "Hostinger",
     "link": "https://www.g2.com/products/hostinger/reviews",
     "ranking": 9,
-    "numberOfReviews": 751,
+    "numberOfReviews": 755,
     "rate": 4.4,
     "description": "Hostinger provides every customer with all the necessary tools to have a fully-functional website up and running as quickly as possible. Hostinger provides an incredibly convenient drag & drop website builder and application installer."
   },
   {
     "name": "Azure Virtual Machines",
     "link": "https://www.g2.com/products/azure-virtual-machines/reviews",
     "ranking": 10,
-    "numberOfReviews": 474,
+    "numberOfReviews": 420,
     "rate": 4.4,
     "description": "Azure Virtual Machines gives you the flexibility of virtualization for a wide range of computing solutions: development and testing, running applications, and extending your datacenter with support for Linux, Windows Server, SQL Server, Oracle, IBM, and SAP."
   }
