Skip to content

Commit 4dd3479

Browse files
fix: update country code to US and enhance salary parsing logic in Glassdoor scraper
1 parent 18da4eb commit 4dd3479

File tree

1 file changed

+74
-4
lines changed

1 file changed

+74
-4
lines changed

glassdoor-scraper/glassdoor.py

Lines changed: 74 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020
# Glassdoor.com requires Anti Scraping Protection bypass feature.
2121
# for more: https://scrapfly.io/docs/scrape-api/anti-scraping-protection
2222
"asp": True,
23-
"country": "GB",
23+
"country": "US",
2424
"render_js": True,
2525
}
2626

@@ -148,9 +148,79 @@ async def scrape_reviews(url: str, max_pages: Optional[int] = None) -> Dict:
148148

149149
def _parse_leading_count(text: str) -> int:
    """Return the integer that starts *text* (thousands commas allowed), or 0."""
    tokens = text.split()
    if not tokens:
        return 0
    try:
        return int(tokens[0].replace(",", ""))
    except ValueError:
        return 0


def _parse_salary_amount(text: str) -> float:
    """Convert one amount such as "$85K" or "$102,500" to a float.

    Raises ValueError when the remaining text is not a number.
    """
    cleaned = text.strip().replace("$", "").replace(",", "")
    multiplier = 1.0
    suffix = cleaned[-1:].upper()
    if suffix == "K":
        multiplier, cleaned = 1_000.0, cleaned[:-1]
    elif suffix == "M":
        multiplier, cleaned = 1_000_000.0, cleaned[:-1]
    # Scale the number instead of appending zeros to the text, so that
    # fractional amounts such as "50.5K" become 50500.0 rather than 50.5.
    return float(cleaned) * multiplier


def _parse_salary_range(text: str) -> list:
    """Turn a "min - max" salary string into percentile entries, or [] if unparsable."""
    # Treat en/em dashes like a plain hyphen so "$50K – $70K" is also accepted.
    normalized = text.replace("\u2013", "-").replace("\u2014", "-")
    bounds = [part.strip() for part in normalized.split("-")]
    if len(bounds) != 2:
        return []
    try:
        low, high = (_parse_salary_amount(bound) for bound in bounds)
    except ValueError:
        return []
    return [
        {"ident": "min", "value": low},
        {"ident": "max", "value": high},
    ]


def parse_salaries(result: ScrapeApiResponse) -> Dict:
    """Parse Glassdoor salaries page for salary data.

    Reads the rendered HTML through ``result.selector`` and returns a dict with:

    - ``results``: one entry per ``[data-test="salary-item"]`` element that has
      a job title, holding ``jobTitle.text``, ``salaryCount`` and
      ``basePayStatistics.percentiles`` (``min``/``max`` entries, or an empty
      list when the salary range is missing or cannot be parsed).
    - ``numPages``: the highest page number found in the pagination, default 1.
    - ``salaryCount``: the number of entries in ``results``.
    - ``jobTitleCount``: the leading number of the "... job titles" counter
      text, or 0 when that text is absent or unparsable.

    Parsing is best-effort: malformed values fall back to their defaults
    instead of raising.
    """
    results = []
    for item in result.selector.css('[data-test="salary-item"]'):
        job_title = item.css('.SalaryItem_jobTitle__XWGpT::text').get()
        if not job_title:
            continue

        salary_range = item.css('.SalaryItem_salaryRange__UL9vQ::text').get()
        salary_count_text = item.css('.SalaryItem_salaryCount__GT665::text').get() or ""

        salary_count = 0
        if "Salaries submitted" in salary_count_text:
            salary_count = _parse_leading_count(salary_count_text)

        results.append(
            {
                "jobTitle": {
                    "text": job_title,
                },
                "salaryCount": salary_count,
                "basePayStatistics": {
                    "percentiles": _parse_salary_range(salary_range) if salary_range else [],
                },
            }
        )

    # Extract pagination from HTML. Labels are stripped first, and isdecimal()
    # only lets through text that int() can convert, so one odd label cannot
    # discard the whole page count.
    page_labels = result.selector.css('.pagination_PageNumberText__F7427::text').getall()
    page_numbers = [
        int(label)
        for label in (raw.strip() for raw in page_labels)
        if label.isdecimal()
    ]

    # Extract job title count from HTML
    job_title_count = 0
    result_count_text = result.selector.css('.SortBar_SearchCount__cYwt6::text').get() or ""
    if "job titles" in result_count_text:
        job_title_count = _parse_leading_count(result_count_text)

    salary_data = {
        "results": results,
        "numPages": max(page_numbers, default=1),
        "salaryCount": len(results),
        "jobTitleCount": job_title_count,
    }

    log.info(f"Parsed {len(salary_data['results'])} salary items")
    return salary_data
154224

155225

156226
async def scrape_salaries(url: str, max_pages: Optional[int] = None) -> Dict:

0 commit comments

Comments
 (0)