 BASE_CONFIG = {
     # bypass zoominfo.com web scraping blocking
     "asp": True,
-    # set the proxy country to US
-    "country": "US",
+    # set the proxy country to CA
+    "country": "CA",
 }


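For context on how this config is consumed: BASE_CONFIG is splatted into every ScrapeConfig request, so the ASP bypass and the CA proxy country apply to all scrapes. A minimal sketch of that wiring, assuming the Scrapfly API key lives in a SCRAPFLY_KEY environment variable (the variable name and the example URL are placeholders, not part of this commit):

import os
from scrapfly import ScrapeConfig, ScrapflyClient

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])  # hypothetical env var name

# every request inherits the anti-bot bypass and the CA proxy country
config = ScrapeConfig("https://www.zoominfo.com/c/example-co/000000000", **BASE_CONFIG)
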
@@ -28,28 +28,62 @@ def parse_company(response: ScrapeApiResponse) -> List[Dict]:
     data = json.loads(data)["pageData"]
     return data

-
 def parse_directory(response: ScrapeApiResponse) -> dict:
     """parse zoominfo directory pages"""
     data = response.selector.css("script#ng-state::text").get()
-    data = json.loads(data)
-    companies_data = data.get("companiesSearchData", {}).get("companies", [])
-    companies = [company.get("companyUrl") for company in companies_data]
-    pagination_data = data.get("companiesSearchData", {}).get("paginationData", {}).get("pages", [])
-    pagination = [page.get("url") for page in pagination_data if page.get("url")]
+    data = json.loads(data)
+    # Check which data source is available
+    companies_search_data = data.get("companiesSearchData")
+    ai_search_results = data.get("aiSearchResults")
+
+    if companies_search_data:
+        # Use companiesSearchData logic
+        companies_data = companies_search_data.get("companies", [])
+        companies = [company.get("companyUrl") for company in companies_data]
+        pagination_data = companies_search_data.get("paginationData", {}).get("pages", [])
+        pagination = [page.get("url") for page in pagination_data if page.get("url")]
+    elif ai_search_results:
+        # Use aiSearchResults logic
+        companies_data = ai_search_results.get("data", [])
+        companies = [company.get("companyUrl") for company in companies_data if company.get("companyUrl")]
+        # For aiSearchResults, derive pagination from metadata
+        total_results = ai_search_results.get("totalResults", 0)
+        page_num = data.get("pageNum", 1)
+        base_url = data.get("baseUrl", "")
+        # Calculate pagination (assuming results per page based on current data length)
+        results_per_page = len(companies_data) if companies_data else 10
+        if results_per_page > 0 and total_results > 0:
+            total_pages = (total_results + results_per_page - 1) // results_per_page
+            pagination = [f"{base_url}?pageNum={i}" for i in range(1, total_pages + 1)]
+        else:
+            pagination = []
+    else:
+        # Neither data source available
+        companies = []
+        pagination = []
+
     return {"companies": companies, "pagination": pagination}
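The aiSearchResults branch derives the page count by ceiling division over the result metadata. A tiny worked example with made-up numbers (the base URL is hypothetical): 45 total results at 10 results per page yields 5 pages.

total_results = 45   # hypothetical metadata values
results_per_page = 10
total_pages = (total_results + results_per_page - 1) // results_per_page  # == 5, i.e. ceil(45 / 10)
pagination = [f"/companies-search/location-usa?pageNum={i}" for i in range(1, total_pages + 1)]
assert pagination[-1].endswith("pageNum=5")

Note the fallback of 10 results per page only kicks in when the current page returned no companies, in which case total_results is usually 0 as well and the pagination list stays empty.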

-
 async def scrape_comapnies(urls: List[str]) -> List[Dict]:
     """scrape company data from zoominfo company pages"""
     to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
     companies = []
     failed = []
-    try:
-        async for response in SCRAPFLY.concurrent_scrape(to_scrape):
-            companies.append(parse_company(response))
-    except ScrapflyAspError:
-        failed.append(response.context["url"])
+    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
+        # Check if this is a successful response or an error
+        if isinstance(response, ScrapeApiResponse):
+            try:
+                companies.append(parse_company(response))
+            except Exception as e:
+                log.error(f"Failed to parse company data: {e}")
+                failed.append(response.context["url"])
+        else:
+            # This is an error response (ApiHttpServerError, ScrapflyAspError, etc.)
+            log.warning(f"Request failed with error: {response}")
+            # Extract URL from the response context if available
+            if hasattr(response, 'context') and 'url' in response.context:
+                failed.append(response.context["url"])
+
     if len(failed) != 0:
         log.debug(f"{len(failed)} requests are blocked, trying again with render_js enabled and residential proxies")
         for url in failed:
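The hunk is cut off inside that retry loop. For reference, a plausible shape of the fallback, assuming it re-scrapes each blocked URL with JavaScript rendering and a residential proxy pool (render_js and proxy_pool are real ScrapeConfig options, but this body is a sketch, not the committed code):

    # sketch only -- the committed loop body is truncated above
    for url in failed:
        response = await SCRAPFLY.async_scrape(
            ScrapeConfig(url, **BASE_CONFIG, render_js=True, proxy_pool="public_residential_pool")
        )
        companies.append(parse_company(response))
    return companies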