 BASE_CONFIG = {
     # bypass zoominfo.com web scraping blocking
     "asp": True,
-    # set the proxy country to US
-    "country": "US",
+    # set the proxy country to CA
+    "country": "CA",
 }


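For context on how this config is consumed: BASE_CONFIG is splatted into every ScrapeConfig request, so the ASP bypass and the CA proxy country apply to all scrapes. A minimal sketch of that wiring, assuming the Scrapfly API key lives in a SCRAPFLY_KEY environment variable (the variable name and the example URL are placeholders, not part of this commit):

import os
from scrapfly import ScrapeConfig, ScrapflyClient

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])  # hypothetical env var name

# every request inherits the anti-bot bypass and the CA proxy country
config = ScrapeConfig("https://www.zoominfo.com/c/example-co/000000000", **BASE_CONFIG)
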
@@ -28,28 +28,62 @@ def parse_company(response: ScrapeApiResponse) -> List[Dict]:
     data = json.loads(data)["pageData"]
     return data

-
 def parse_directory(response: ScrapeApiResponse) -> dict:
     """parse zoominfo directory pages"""
     data = response.selector.css("script#ng-state::text").get()
-    data = json.loads(data)
-    companies_data = data.get("companiesSearchData", {}).get("companies", [])
-    companies = [company.get("companyUrl") for company in companies_data]
-    pagination_data = data.get("companiesSearchData", {}).get("paginationData", {}).get("pages", [])
-    pagination = [page.get("url") for page in pagination_data if page.get("url")]
+    data = json.loads(data)
+    # Check which data source is available
+    companies_search_data = data.get("companiesSearchData")
+    ai_search_results = data.get("aiSearchResults")
+
+    if companies_search_data:
+        # Use companiesSearchData logic
+        companies_data = companies_search_data.get("companies", [])
+        companies = [company.get("companyUrl") for company in companies_data]
+        pagination_data = companies_search_data.get("paginationData", {}).get("pages", [])
+        pagination = [page.get("url") for page in pagination_data if page.get("url")]
+    elif ai_search_results:
+        # Use aiSearchResults logic
+        companies_data = ai_search_results.get("data", [])
+        companies = [company.get("companyUrl") for company in companies_data if company.get("companyUrl")]
+        # For aiSearchResults, derive pagination from metadata
+        total_results = ai_search_results.get("totalResults", 0)
+        page_num = data.get("pageNum", 1)
+        base_url = data.get("baseUrl", "")
+        # Calculate pagination (assuming results per page based on current data length)
+        results_per_page = len(companies_data) if companies_data else 10
+        if results_per_page > 0 and total_results > 0:
+            total_pages = (total_results + results_per_page - 1) // results_per_page
+            pagination = [f"{base_url}?pageNum={i}" for i in range(1, total_pages + 1)]
+        else:
+            pagination = []
+    else:
+        # Neither data source available
+        companies = []
+        pagination = []
+
     return {"companies": companies, "pagination": pagination}
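The aiSearchResults branch derives the page count by ceiling division over the result metadata. A tiny worked example with made-up numbers (the base URL is hypothetical): 45 total results at 10 results per page yields 5 pages.

total_results = 45   # hypothetical metadata values
results_per_page = 10
total_pages = (total_results + results_per_page - 1) // results_per_page  # == 5, i.e. ceil(45 / 10)
pagination = [f"/companies-search/location-usa?pageNum={i}" for i in range(1, total_pages + 1)]
assert pagination[-1].endswith("pageNum=5")

Note the fallback of 10 results per page only kicks in when the current page returned no companies, in which case total_results is usually 0 as well and the pagination list stays empty.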

-
 async def scrape_comapnies(urls: List[str]) -> List[Dict]:
     """scrape company data from zoominfo company pages"""
     to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
     companies = []
     failed = []
-    try:
-        async for response in SCRAPFLY.concurrent_scrape(to_scrape):
-            companies.append(parse_company(response))
-    except ScrapflyAspError:
-        failed.append(response.context["url"])
+    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
+        # Check if this is a successful response or an error
+        if isinstance(response, ScrapeApiResponse):
+            try:
+                companies.append(parse_company(response))
+            except Exception as e:
+                log.error(f"Failed to parse company data: {e}")
+                failed.append(response.context["url"])
+        else:
+            # This is an error response (ApiHttpServerError, ScrapflyAspError, etc.)
+            log.warning(f"Request failed with error: {response}")
+            # Extract URL from the response context if available
+            if hasattr(response, 'context') and 'url' in response.context:
+                failed.append(response.context["url"])
+
     if len(failed) != 0:
         log.debug(f"{len(failed)} requests are blocked, trying again with render_js enabled and residential proxies")
         for url in failed:
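The hunk is cut off inside that retry loop. For reference, a plausible shape of the fallback, assuming it re-scrapes each blocked URL with JavaScript rendering and a residential proxy pool (render_js and proxy_pool are real ScrapeConfig options, but this body is a sketch, not the committed code):

    # sketch only -- the committed loop body is truncated above
    for url in failed:
        response = await SCRAPFLY.async_scrape(
            ScrapeConfig(url, **BASE_CONFIG, render_js=True, proxy_pool="public_residential_pool")
        )
        companies.append(parse_company(response))
    return companies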