import asyncio
import os
import json
from dotenv import load_dotenv
from chromium import ChromiumLoader  # Import your ChromiumLoader class
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from aiohttp import ClientError

# Load environment variables for API keys
load_dotenv()

# ************************************************
# Define function to analyze content with ScrapegraphAI
# ************************************************
async def analyze_content_with_scrapegraph(content: str):
717 """
8- Test scraper for the given backend and URLs.
18+ Analyze scraped content using ScrapegraphAI.
19+
20+ Args:
21+ content (str): The scraped HTML or text content.
22+
23+ Returns:
24+ dict: The result from ScrapegraphAI analysis.
25+ """
    try:
        # Initialize ScrapegraphAI SmartScraperGraph
        smart_scraper = SmartScraperGraph(
            prompt="Summarize the main content of this webpage and extract any contact information.",
            source=content,  # Pass the content directly
            config={
                "llm": {
                    "api_key": os.getenv("OPENAI_API_KEY"),
                    "model": "openai/gpt-4o",
                },
                "verbose": True
            }
        )
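        # Note: SmartScraperGraph.run() is synchronous, so it blocks the event
        # loop while the LLM call runs, even though this wrapper is async.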
        result = smart_scraper.run()
        return result
    except Exception as e:
        print(f"❌ ScrapegraphAI analysis failed: {e}")
        return {"error": str(e)}

# ************************************************
# Test scraper and ScrapegraphAI pipeline
# ************************************************
async def test_scraper_with_analysis(scraper: ChromiumLoader, urls: list):
    """
    Test scraper for the given backend and URLs, then analyze content with ScrapegraphAI.

    Args:
        scraper (ChromiumLoader): The ChromiumLoader instance.
        urls (list): A list of URLs to scrape.
    """
    for url in urls:
        try:
            print(f"\n🔎 Scraping: {url} using {scraper.backend}...")
            result = await scraper.scrape(url)

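            # Heuristic: ChromiumLoader is assumed to return text containing
            # "Error" (or an empty string) when a page fails to load.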
1761 if "Error" in result or not result .strip ():
1862 print (f"❌ Failed to scrape { url } : { result } " )
1963 else :
2064 print (f"✅ Successfully scraped { url } . Content (first 200 chars): { result [:200 ]} " )

                # Pass scraped content to ScrapegraphAI for analysis
                print("🤖 Analyzing content with ScrapegraphAI...")
                analysis_result = await analyze_content_with_scrapegraph(result)
                print("📝 Analysis Result:")
                print(json.dumps(analysis_result, indent=4))

        except ClientError as ce:
            print(f"❌ Network error while scraping {url}: {ce}")
        except Exception as e:
            print(f"❌ Unexpected error while scraping {url}: {e}")

# ************************************************
# Main Execution
# ************************************************
async def main():
    urls_to_scrape = [
        "https://example.com",
        "https://www.python.org",
        "https://invalid-url.test"
    ]
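    # Note: the .test TLD is reserved (RFC 2606) and never resolves, so the
    # last URL deliberately exercises the error-handling paths.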

    # Test with Playwright backend
    print("\n--- Testing Playwright Backend ---")
    try:
        scraper_playwright = ChromiumLoader(urls=urls_to_scrape, backend="playwright", headless=True)
        await test_scraper_with_analysis(scraper_playwright, urls_to_scrape)
    except ImportError as ie:
        print(f"❌ Playwright ImportError: {ie}")
    except Exception as e:
        print(f"❌ Error initializing Playwright ChromiumLoader: {e}")

    # Test with Selenium backend
    print("\n--- Testing Selenium Backend ---")
    try:
        scraper_selenium = ChromiumLoader(urls=urls_to_scrape, backend="selenium", headless=True)
        await test_scraper_with_analysis(scraper_selenium, urls_to_scrape)
    except ImportError as ie:
        print(f"❌ Selenium ImportError: {ie}")
    except Exception as e:
        print(f"❌ Error initializing Selenium ChromiumLoader: {e}")

if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("❌ Program interrupted by user.")
    except Exception as e:
        print(f"❌ Program crashed: {e}")
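
# Usage note (assumptions: OPENAI_API_KEY is defined in a local .env file, and
# a chromium.py module providing ChromiumLoader sits next to this script):
#   pip install scrapegraphai python-dotenv aiohttp playwright selenium
#   playwright install chromium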