|
20 | 20 | # Glassdoor.com requires Anti Scraping Protection bypass feature. |
21 | 21 | # for more: https://scrapfly.io/docs/scrape-api/anti-scraping-protection |
22 | 22 | "asp": True, |
23 | | - "country": "GB", |
| 23 | + "country": "US", |
24 | 24 | "render_js": True, |
25 | 25 | } |
26 | 26 |
|
@@ -148,9 +148,79 @@ async def scrape_reviews(url: str, max_pages: Optional[int] = None) -> Dict: |
148 | 148 |
|
def _parse_salary_amount(text: str) -> float:
    """Convert a display amount like '$54K', '$1.5K' or '$54,000' to a float.

    Handles an optional '$' prefix, thousands separators, and a trailing
    'K'/'k' multiplier. Raises ValueError for unrecognizable text.
    """
    cleaned = text.strip().replace("$", "").replace(",", "")
    if cleaned.lower().endswith("k"):
        # Multiply instead of string-replacing 'K' with '000': the old
        # replace('K', '000') turned "$1.5K" into "1.5000" (== 1.5).
        return float(cleaned[:-1]) * 1000
    return float(cleaned)


def parse_salaries(result: ScrapeApiResponse) -> Dict:
    """Parse Glassdoor salaries page for salary data.

    Scrapes the rendered HTML directly (the hidden-data cache no longer
    carries aggregated salary estimates) and returns a dict shaped like
    Glassdoor's aggregated salary payload:
    ``{"results": [...], "numPages": int, "salaryCount": int, "jobTitleCount": int}``.

    NOTE(review): the CSS class names below embed build hashes (e.g.
    ``SalaryItem_jobTitle__XWGpT``) and will break when Glassdoor redeploys;
    prefer stable ``data-test`` attributes where Glassdoor provides them.
    """
    salary_data: Dict = {
        "results": [],
        "numPages": 1,
        "salaryCount": 0,
        "jobTitleCount": 0,
    }

    for item in result.selector.css('[data-test="salary-item"]'):
        job_title = item.css('.SalaryItem_jobTitle__XWGpT::text').get()
        if not job_title:
            # Skip rows matching the item selector but carrying no title.
            continue

        salary_range = item.css('.SalaryItem_salaryRange__UL9vQ::text').get()
        salary_count_text = item.css('.SalaryItem_salaryCount__GT665::text').get() or ""

        # "12 Salaries submitted" / "1 Salary submitted" -> 12 / 1.
        # Fix: the previous check only matched the plural "Salaries submitted"
        # and dropped the count whenever a title had a single submission;
        # also tolerate thousands separators in the leading number.
        salary_count = 0
        if "submitted" in salary_count_text.lower():
            try:
                salary_count = int(salary_count_text.split()[0].replace(",", ""))
            except (ValueError, IndexError):
                pass  # best-effort: leave 0 when the count text is malformed

        # Parse "min - max" salary range, e.g. "$54K - $80K".
        percentiles = []
        if salary_range and " - " in salary_range:
            try:
                min_text, max_text = salary_range.split(" - ")
                percentiles = [
                    {"ident": "min", "value": _parse_salary_amount(min_text)},
                    {"ident": "max", "value": _parse_salary_amount(max_text)},
                ]
            except ValueError:
                percentiles = []  # malformed range: omit statistics

        salary_data["results"].append(
            {
                "jobTitle": {"text": job_title},
                "salaryCount": salary_count,
                "basePayStatistics": {"percentiles": percentiles},
            }
        )

    # Pagination: take the highest numbered page link, default 1.
    page_links = result.selector.css('.pagination_PageNumberText__F7427::text').getall()
    numeric_pages = [int(page) for page in page_links if page.isdigit()]
    if numeric_pages:
        salary_data["numPages"] = max(numeric_pages)

    # Total job-title count from the sort bar, e.g. "1,234 job titles".
    result_count_text = result.selector.css('.SortBar_SearchCount__cYwt6::text').get() or ""
    if "job titles" in result_count_text:
        try:
            salary_data["jobTitleCount"] = int(result_count_text.split()[0].replace(",", ""))
        except (ValueError, IndexError):
            pass  # best-effort: leave 0 when the header text is malformed

    # Number of parsed items on this page (not the sum of submissions).
    salary_data["salaryCount"] = len(salary_data["results"])

    log.info(f"Parsed {len(salary_data['results'])} salary items")
    return salary_data
154 | 224 |
|
155 | 225 |
|
156 | 226 | async def scrape_salaries(url: str, max_pages: Optional[int] = None) -> Dict: |
|
0 commit comments