Skip to content

Commit fe66e3d

Browse files
committed
#171 used ScrapegraphAI library and created a more consistent example.
1 parent 753737e commit fe66e3d

File tree

1 file changed

+66
-10
lines changed

1 file changed

+66
-10
lines changed
Lines changed: 66 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,94 @@
11
import asyncio
2-
from chromium import ChromiumLoader # Import the ChromiumLoader class from chromium.py
2+
import os
3+
import json
4+
from dotenv import load_dotenv
5+
from chromium import ChromiumLoader # Import your ChromiumLoader class
6+
from scrapegraphai.graphs import SmartScraperGraph
7+
from scrapegraphai.utils import prettify_exec_info
38
from aiohttp import ClientError
49

10+
# Load environment variables for API keys
11+
load_dotenv()
512

6-
async def test_scraper(scraper: ChromiumLoader, urls: list):
13+
# ************************************************
# Define function to analyze content with ScrapegraphAI
# ************************************************
async def analyze_content_with_scrapegraph(content: str) -> dict:
    """
    Analyze scraped content using ScrapegraphAI.

    Args:
        content (str): The scraped HTML or text content.

    Returns:
        dict: The result from ScrapegraphAI analysis, or an
        ``{"error": "..."}`` mapping if the analysis failed.
    """
    try:
        # Initialize ScrapegraphAI SmartScraperGraph
        smart_scraper = SmartScraperGraph(
            prompt="Summarize the main content of this webpage and extract any contact information.",
            source=content,  # Pass the content directly
            config={
                "llm": {
                    "api_key": os.getenv("OPENAI_API_KEY"),
                    "model": "openai/gpt-4o",
                },
                "verbose": True,
            },
        )
        # SmartScraperGraph.run() is synchronous (it blocks on LLM calls);
        # run it in a worker thread so the asyncio event loop — which is
        # concurrently driving the browser scrapers — is not stalled.
        result = await asyncio.to_thread(smart_scraper.run)
        return result
    except Exception as e:
        # Best-effort: report the failure back to the caller as data
        # instead of raising, so one bad page does not abort the run.
        print(f"❌ ScrapegraphAI analysis failed: {e}")
        return {"error": str(e)}
# ************************************************
# Test scraper and ScrapegraphAI pipeline
# ************************************************
async def test_scraper_with_analysis(scraper: ChromiumLoader, urls: list):
    """
    Test scraper for the given backend and URLs, then analyze content with ScrapegraphAI.

    Args:
        scraper (ChromiumLoader): The ChromiumLoader instance.
        urls (list): A list of URLs to scrape.
    """
    for target in urls:
        try:
            print(f"\n🔎 Scraping: {target} using {scraper.backend}...")
            page_content = await scraper.scrape(target)

            # An embedded "Error" marker or an empty/whitespace-only
            # response both count as a failed scrape.
            scrape_failed = "Error" in page_content or not page_content.strip()
            if scrape_failed:
                print(f"❌ Failed to scrape {target}: {page_content}")
                continue

            print(f"✅ Successfully scraped {target}. Content (first 200 chars): {page_content[:200]}")

            # Hand the scraped content to ScrapegraphAI for analysis.
            print("🤖 Analyzing content with ScrapegraphAI...")
            analysis = await analyze_content_with_scrapegraph(page_content)
            print("📝 Analysis Result:")
            print(json.dumps(analysis, indent=4))

        except ClientError as ce:
            print(f"❌ Network error while scraping {target}: {ce}")
        except Exception as e:
            print(f"❌ Unexpected error while scraping {target}: {e}")

26-
77+
# ************************************************
# Main Execution
# ************************************************
async def main():
    """
    Entry point: scrape a fixed set of URLs with each supported browser
    backend (Playwright, then Selenium) and run the scrape-and-analyze
    pipeline on the results.

    Backend import/initialization failures are reported and skipped so
    one missing backend does not prevent testing the other.
    """
    urls_to_scrape = [
        "https://example.com",
        "https://www.python.org",
        "https://invalid-url.test"
    ]

    # Both backends are exercised identically, so iterate instead of
    # duplicating the try/except scaffolding per backend.
    for backend in ("playwright", "selenium"):
        label = backend.capitalize()  # "Playwright" / "Selenium" for messages
        print(f"\n--- Testing {label} Backend ---")
        try:
            scraper = ChromiumLoader(urls=urls_to_scrape, backend=backend, headless=True)
            await test_scraper_with_analysis(scraper, urls_to_scrape)
        except ImportError as ie:
            # The backend library (playwright/selenium) is not installed.
            print(f"❌ {label} ImportError: {ie}")
        except Exception as e:
            print(f"❌ Error initializing {label} ChromiumLoader: {e}")

50-
51107
# Script entry point: run the async pipeline and keep top-level failures
# from surfacing as raw tracebacks.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # User pressed Ctrl+C; exit quietly without a traceback.
        print("Program interrupted by user.")
    except Exception as e:
        # Last-resort guard so unexpected failures are reported, not raised.
        print(f"❌ Program crashed: {e}")

0 commit comments

Comments
 (0)