Commit 915df7e

refactor: improve directory parsing logic and error handling in ZoomInfo scraper
1 parent 0660e53 · commit 915df7e


zoominfo-scraper/zoominfo.py

Lines changed: 48 additions & 14 deletions
@@ -16,8 +16,8 @@
 BASE_CONFIG = {
     # bypass zoominfo.com web scraping blocking
     "asp": True,
-    # set the proxy country to US
-    "country": "US",
+    # set the proxy country to CA
+    "country": "CA",
 }
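
For context, every request in this scraper unpacks BASE_CONFIG into a Scrapfly ScrapeConfig, so switching "country" here changes the proxy country for all scrapes. A minimal sketch of that pattern, assuming the scrapfly-sdk client setup used elsewhere in the file (the API key and URL below are placeholders, not part of this commit):

    from scrapfly import ScrapeConfig, ScrapflyClient

    SCRAPFLY = ScrapflyClient(key="YOUR-SCRAPFLY-KEY")  # placeholder API key

    BASE_CONFIG = {
        # bypass zoominfo.com web scraping blocking
        "asp": True,
        # set the proxy country to CA
        "country": "CA",
    }

    # equivalent to ScrapeConfig(url, asp=True, country="CA")
    config = ScrapeConfig("https://www.zoominfo.com/c/example-company/1234", **BASE_CONFIG)
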
@@ -28,28 +28,62 @@ def parse_company(response: ScrapeApiResponse) -> List[Dict]:
     data = json.loads(data)["pageData"]
     return data
 
-
 def parse_directory(response: ScrapeApiResponse) -> dict:
     """parse zoominfo directory pages"""
     data = response.selector.css("script#ng-state::text").get()
-    data = json.loads(data)
-    companies_data = data.get("companiesSearchData", {}).get("companies", [])
-    companies = [company.get("companyUrl") for company in companies_data]
-    pagination_data = data.get("companiesSearchData", {}).get("paginationData", {}).get("pages", [])
-    pagination = [page.get("url") for page in pagination_data if page.get("url")]
+    data = json.loads(data)
+    # Check which data source is available
+    companies_search_data = data.get("companiesSearchData")
+    ai_search_results = data.get("aiSearchResults")
+
+    if companies_search_data:
+        # Use companiesSearchData logic
+        companies_data = companies_search_data.get("companies", [])
+        companies = [company.get("companyUrl") for company in companies_data]
+        pagination_data = companies_search_data.get("paginationData", {}).get("pages", [])
+        pagination = [page.get("url") for page in pagination_data if page.get("url")]
+    elif ai_search_results:
+        # Use aiSearchResults logic
+        companies_data = ai_search_results.get("data", [])
+        companies = [company.get("companyUrl") for company in companies_data if company.get("companyUrl")]
+        # For aiSearchResults, derive pagination from metadata
+        total_results = ai_search_results.get("totalResults", 0)
+        page_num = data.get("pageNum", 1)
+        base_url = data.get("baseUrl", "")
+        # Calculate pagination (assuming results per page based on current data length)
+        results_per_page = len(companies_data) if companies_data else 10
+        if results_per_page > 0 and total_results > 0:
+            total_pages = (total_results + results_per_page - 1) // results_per_page
+            pagination = [f"{base_url}?pageNum={i}" for i in range(1, total_pages + 1)]
+        else:
+            pagination = []
+    else:
+        # Neither data source available
+        companies = []
+        pagination = []
+
     return {"companies": companies, "pagination": pagination}
 
-
 async def scrape_comapnies(urls: List[str]) -> List[Dict]:
     """scrape company data from zoominfo company pages"""
     to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
     companies = []
     failed = []
-    try:
-        async for response in SCRAPFLY.concurrent_scrape(to_scrape):
-            companies.append(parse_company(response))
-    except ScrapflyAspError:
-        failed.append(response.context["url"])
+    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
+        # Check if this is a successful response or an error
+        if isinstance(response, ScrapeApiResponse):
+            try:
+                companies.append(parse_company(response))
+            except Exception as e:
+                log.error(f"Failed to parse company data: {e}")
+                failed.append(response.context["url"])
+        else:
+            # This is an error response (ApiHttpServerError, ScrapflyAspError, etc.)
+            log.warning(f"Request failed with error: {response}")
+            # Extract URL from the response context if available
+            if hasattr(response, 'context') and 'url' in response.context:
+                failed.append(response.context["url"])
+
 
     if len(failed) != 0:
         log.debug(f"{len(failed)} requests are blocked, trying again with render_js enabled and residential proxies")
         for url in failed:
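
The aiSearchResults branch has no explicit page list, so the new code derives one by ceiling division over totalResults. A standalone sketch of that arithmetic with made-up numbers (not taken from a real ZoomInfo response):

    total_results = 487      # ai_search_results["totalResults"]
    results_per_page = 25    # len(companies_data) on the current page
    base_url = "/companies-search/location-canada"  # hypothetical data["baseUrl"]

    # ceiling division: 487 results at 25 per page -> 20 pages
    total_pages = (total_results + results_per_page - 1) // results_per_page
    pagination = [f"{base_url}?pageNum={i}" for i in range(1, total_pages + 1)]

    assert total_pages == 20
    assert pagination[-1] == "/companies-search/location-canada?pageNum=20"

Note the per-page count is inferred from the length of the current page's results, so the generated page URLs are an approximation rather than values read directly from the payload.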

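The hunk ends right before the retry loop, so the fallback body is not shown in this commit view. A hypothetical sketch of what retrying the blocked URLs with JS rendering and residential proxies could look like with the scrapfly-sdk; the render_js and proxy_pool values are assumptions for illustration, not code from this commit. The isinstance check mirrors the error handling added above, where concurrent_scrape can yield error objects instead of raising:

    # hypothetical retry path for the URLs collected in `failed`
    retry_configs = [
        ScrapeConfig(
            url,
            asp=True,
            country="CA",
            render_js=True,                        # render the page in a headless browser
            proxy_pool="public_residential_pool",  # switch to residential proxies
        )
        for url in failed
    ]
    async for response in SCRAPFLY.concurrent_scrape(retry_configs):
        if isinstance(response, ScrapeApiResponse):
            companies.append(parse_company(response))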