 import asyncio
-from typing import List
-from pydantic import BaseModel
+from typing import List, Dict, Any
 
 from scrapegraph_py import AsyncClient
 from scrapegraph_py.logger import sgai_logger
 
 sgai_logger.set_logging(level="INFO")
 
-# Define the output schema
-class Company(BaseModel):
-    name: str
-    category: str
-    location: str
-
-class CompaniesResponse(BaseModel):
-    companies: List[Company]
 
 async def scrape_companies(client: AsyncClient, url: str, batch: str) -> None:
     """Scrape companies from a specific YC batch with infinite scroll."""
     try:
+        # Initial scrape with infinite scroll enabled
         response = await client.smartscraper(
-            website_url=f"{url}?batch={batch}",
-            user_prompt="Extract all company names and their categories from the page",
-            output_schema=CompaniesResponse,
-            number_of_scrolls=10  # Scroll 10 times to load more companies
+            website_url=url,
+            user_prompt="Extract all company information from this page, including name, description, and website",
+            infinite_scroll=True,
+            scroll_options={
+                "max_scrolls": 10,  # Adjust based on page size
+                "scroll_delay": 2,  # Seconds between scrolls
+                "scroll_to_bottom": True
+            }
         )
-
-        # Parse and print the results
-        result = CompaniesResponse.model_validate(response['result'])
-        print(f"\nCompanies from {batch} batch:")
-        print("=" * 80)
-        for company in result.companies:
-            print(f"Name: {company.name}")
-            print(f"Category: {company.category}")
-            print(f"Location: {company.location}")
-            print("-" * 80)
-
+
+        # Process the results
+        companies = response.get("result", [])
+        if not companies:
+            print(f"No companies found for batch {batch}")
+            return
+
+        # Save or process the companies data
+        print(f"Found {len(companies)} companies in batch {batch}")
+        for company in companies:
+            print(f"Company: {company.get('name', 'N/A')}")
+            print(f"Description: {company.get('description', 'N/A')}")
+            print(f"Website: {company.get('website', 'N/A')}")
+            print("-" * 50)
+
     except Exception as e:
-        print(f"Error scraping {batch} batch: {e}")
+        print(f"Error scraping batch {batch}: {str(e)}")
+
 
 async def main():
     # Initialize async client
-    sgai_client = AsyncClient(api_key="your-api-key-here")
-
+    client = AsyncClient(api_key="your-api-key-here")
+
     try:
-        # Define batches to scrape
-        base_url = "https://www.ycombinator.com/companies"
-        batches = [
-            "Spring%202025",
-            "Winter%202025",
-            "Summer%202024"
-        ]
-
+        # Example YC batch URLs
+        batch_urls = {
+            "W24": "https://www.ycombinator.com/companies?batch=W24",
+            "S23": "https://www.ycombinator.com/companies?batch=S23"
+        }
+
         # Create tasks for each batch
         tasks = [
-            scrape_companies(sgai_client, base_url, batch)
-            for batch in batches
+            scrape_companies(client, url, batch)
+            for batch, url in batch_urls.items()
         ]
-
-        # Execute all scraping tasks concurrently
+
+        # Execute all batch scraping concurrently
         await asyncio.gather(*tasks)
-
-    except Exception as e:
-        print(f"An error occurred: {e}")
-
+
     finally:
-        await sgai_client.close()
+        # Ensure client is properly closed
+        await client.close()
+
 
 if __name__ == "__main__":
     asyncio.run(main())
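
Note on the schema change: with output_schema dropped, response["result"] comes back as free-form JSON rather than a validated CompaniesResponse. Where typed results are still useful downstream, a minimal post-validation sketch along these lines can bridge the gap. It assumes the result is a list of dicts carrying the name/description/website keys the new prompt asks for; the Company model and validate_companies helper are illustrative, not part of the scrapegraph_py SDK.

from typing import List, Optional

from pydantic import BaseModel, ValidationError

class Company(BaseModel):
    name: str
    description: Optional[str] = None
    website: Optional[str] = None

def validate_companies(raw: List[dict]) -> List[Company]:
    # Keep entries that match the expected shape; skip malformed ones
    # instead of failing the whole batch.
    validated: List[Company] = []
    for item in raw:
        try:
            validated.append(Company.model_validate(item))
        except ValidationError:
            continue
    return validated

# Usage inside scrape_companies, after the smartscraper call:
# companies = validate_companies(response.get("result", []))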
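
A related nit: api_key="your-api-key-here" is a placeholder, and real keys are better kept out of source. One way to load the key from the environment, assuming a hypothetical SGAI_API_KEY variable name:

import os

from scrapegraph_py import AsyncClient

# SGAI_API_KEY is an assumed variable name, not something the SDK mandates;
# use whatever your deployment defines.
client = AsyncClient(api_key=os.environ["SGAI_API_KEY"])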