
Commit ec0bd23

add extras
1 parent 0b582be commit ec0bd23

21 files changed: +937 -0 lines changed

examples/extras/.env.example

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
OPENAI_API_KEY="YOUR_OPENAI_API_KEY"
BROWSER_BASE_PROJECT_ID="YOUR_BROWSER_BASE_PROJECT_ID"
BROWSER_BASE_API_KEY="YOUR_BROWSER_BASE_API_KEY"
SCRAPE_DO_API_KEY="YOUR_SCRAPE_DO_API_KEY"
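
The example scripts in this commit read these keys with python-dotenv. As a minimal sketch, assuming a .env file copied from this template sits in the working directory, loading and sanity-checking the keys looks like:

# minimal sketch: load the keys above from a local .env file
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

for key in ("OPENAI_API_KEY", "BROWSER_BASE_PROJECT_ID",
            "BROWSER_BASE_API_KEY", "SCRAPE_DO_API_KEY"):
    if not os.getenv(key):
        raise RuntimeError(f"Missing required environment variable: {key}")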
Binary file added (image, 174 KB; not shown)
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
"""
Example using a state file containing session cookies to authenticate
to a website and scrape protected content.
"""

import os
import random

from dotenv import load_dotenv

# import playwright so we can use it to create the state file
from playwright.async_api import async_playwright

from scrapegraphai.graphs import OmniScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Use Playwright outside of the graph invocation to
# log in and create the state file
# ************************************************


# note this is just an example and probably won't actually work on
# LinkedIn; the implementation of the login is highly dependent on the website
async def do_login():
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(
            timeout=30000,
            headless=False,
            slow_mo=random.uniform(500, 1500),
        )
        page = await browser.new_page()

        # very basic implementation of a login; in reality it may be trickier
        await page.goto("https://www.linkedin.com/login")
        await page.get_by_label("Email or phone").fill("some_bloke@some_domain.com")
        await page.get_by_label("Password").fill("test1234")
        await page.get_by_role("button", name="Sign in").click()
        await page.wait_for_timeout(3000)

        # assuming a successful login, we save the cookies to a file
        await page.context.storage_state(path="./state.json")

        # close the browser once the state file has been written
        await browser.close()


async def main():
    await do_login()

    # ************************************************
    # Define the configuration for the graph
    # ************************************************

    openai_api_key = os.getenv("OPENAI_API_KEY")

    graph_config = {
        "llm": {
            "api_key": openai_api_key,
            "model": "openai/gpt-4o",
        },
        "max_images": 10,
        "headless": False,
        # provide the path to the state file
        "storage_state": "./state.json",
    }

    # ************************************************
    # Create the OmniScraperGraph instance and run it
    # ************************************************

    omni_scraper_graph = OmniScraperGraph(
        prompt="List all the projects with their descriptions.",
        source="https://www.linkedin.com/feed/",
        config=graph_config,
    )

    # the storage_state is used to load the cookies from the state file,
    # so we are authenticated and able to scrape protected content
    result = omni_scraper_graph.run()
    print(result)

    # ************************************************
    # Get graph execution info
    # ************************************************

    graph_exec_info = omni_scraper_graph.get_execution_info()
    print(prettify_exec_info(graph_exec_info))


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
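
If the login flow above succeeds, the saved state.json can also be reloaded directly in plain Playwright to verify that the cookies are still valid before running the graph. A minimal sketch, assuming the file was written by the script above:

# minimal sketch: restore the saved cookies into a fresh Playwright context
import asyncio
from playwright.async_api import async_playwright


async def check_session():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        # new_context(storage_state=...) restores the cookies saved above
        context = await browser.new_context(storage_state="./state.json")
        page = await context.new_page()
        await page.goto("https://www.linkedin.com/feed/")
        print(await page.title())  # a login-page title suggests the session expired
        await browser.close()


asyncio.run(check_session())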
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
"""
Basic example of a scraping pipeline using SmartScraper
"""

import os
import json

from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************


graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "browser_base": {
        "api_key": os.getenv("BROWSER_BASE_API_KEY"),
        "project_id": os.getenv("BROWSER_BASE_PROJECT_ID"),
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List the company's name, what it does, and a contact email.",
    source="https://scrapegraphai.com/",
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
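
SmartScraperGraph also accepts an optional schema argument (passed as schema=None in the later examples in this commit) to constrain the output structure. A hedged sketch using a Pydantic model; the field names here are illustrative, not from the original example:

# sketch: constrain the graph's output with a Pydantic schema (assumed supported)
import os
from pydantic import BaseModel, Field
from scrapegraphai.graphs import SmartScraperGraph


class CompanyInfo(BaseModel):
    name: str = Field(description="The company name")
    description: str = Field(description="What the company does")
    contact_email: str = Field(description="A contact email address")


structured_graph = SmartScraperGraph(
    prompt="List the company's name, what it does, and a contact email.",
    source="https://scrapegraphai.com/",
    schema=CompanyInfo,  # a Pydantic model instead of schema=None
    config={
        "llm": {
            "api_key": os.getenv("OPENAI_API_KEY"),
            "model": "openai/gpt-4o",
        },
    },
)
result = structured_graph.run()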
Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
import asyncio
import os
import json

from aiohttp import ClientError
from dotenv import load_dotenv

from scrapegraphai.docloaders.chromium import ChromiumLoader  # Import your ChromiumLoader class
from scrapegraphai.graphs import SmartScraperGraph

# Load environment variables for API keys
load_dotenv()


# ************************************************
# Define function to analyze content with ScrapegraphAI
# ************************************************
async def analyze_content_with_scrapegraph(content: str):
    """
    Analyze scraped content using ScrapegraphAI.

    Args:
        content (str): The scraped HTML or text content.

    Returns:
        dict: The result from the ScrapegraphAI analysis.
    """
    try:
        # Initialize the ScrapegraphAI SmartScraperGraph
        smart_scraper = SmartScraperGraph(
            prompt="Summarize the main content of this webpage and extract any contact information.",
            source=content,  # Pass the content directly
            config={
                "llm": {
                    "api_key": os.getenv("OPENAI_API_KEY"),
                    "model": "openai/gpt-4o",
                },
                "verbose": True,
            },
        )
        result = smart_scraper.run()
        return result
    except Exception as e:
        print(f"❌ ScrapegraphAI analysis failed: {e}")
        return {"error": str(e)}


# ************************************************
# Test scraper and ScrapegraphAI pipeline
# ************************************************
async def test_scraper_with_analysis(scraper: ChromiumLoader, urls: list):
    """
    Test the scraper for the given backend and URLs, then analyze the content with ScrapegraphAI.

    Args:
        scraper (ChromiumLoader): The ChromiumLoader instance.
        urls (list): A list of URLs to scrape.
    """
    for url in urls:
        try:
            print(f"\n🔎 Scraping: {url} using {scraper.backend}...")
            result = await scraper.scrape(url)

            if "Error" in result or not result.strip():
                print(f"❌ Failed to scrape {url}: {result}")
            else:
                print(f"✅ Successfully scraped {url}. Content (first 200 chars): {result[:200]}")

                # Pass the scraped content to ScrapegraphAI for analysis
                print("🤖 Analyzing content with ScrapegraphAI...")
                analysis_result = await analyze_content_with_scrapegraph(result)
                print("📝 Analysis Result:")
                print(json.dumps(analysis_result, indent=4))

        except ClientError as ce:
            print(f"❌ Network error while scraping {url}: {ce}")
        except Exception as e:
            print(f"❌ Unexpected error while scraping {url}: {e}")


# ************************************************
# Main Execution
# ************************************************
async def main():
    urls_to_scrape = [
        "https://example.com",
        "https://www.python.org",
        "https://invalid-url.test",
    ]

    # Test with the Playwright backend
    print("\n--- Testing Playwright Backend ---")
    try:
        scraper_playwright_chromium = ChromiumLoader(
            urls=urls_to_scrape, backend="playwright", headless=True, browser_name="chromium"
        )
        await test_scraper_with_analysis(scraper_playwright_chromium, urls_to_scrape)

        scraper_playwright_firefox = ChromiumLoader(
            urls=urls_to_scrape, backend="playwright", headless=True, browser_name="firefox"
        )
        await test_scraper_with_analysis(scraper_playwright_firefox, urls_to_scrape)
    except ImportError as ie:
        print(f"❌ Playwright ImportError: {ie}")
    except Exception as e:
        print(f"❌ Error initializing Playwright ChromiumLoader: {e}")

    # Test with the Selenium backend
    print("\n--- Testing Selenium Backend ---")
    try:
        scraper_selenium_chromium = ChromiumLoader(
            urls=urls_to_scrape, backend="selenium", headless=True, browser_name="chromium"
        )
        await test_scraper_with_analysis(scraper_selenium_chromium, urls_to_scrape)

        scraper_selenium_firefox = ChromiumLoader(
            urls=urls_to_scrape, backend="selenium", headless=True, browser_name="firefox"
        )
        await test_scraper_with_analysis(scraper_selenium_firefox, urls_to_scrape)
    except ImportError as ie:
        print(f"❌ Selenium ImportError: {ie}")
    except Exception as e:
        print(f"❌ Error initializing Selenium ChromiumLoader: {e}")


if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        print("❌ Program interrupted by user.")
    except Exception as e:
        print(f"❌ Program crashed: {e}")
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
"""
Basic example of a scraping pipeline using SmartScraperGraph with Groq
"""

import os
import json

from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperGraph

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("GROQ_APIKEY"),
        "model": "groq/gemma-7b-it",
    },
    "verbose": True,
    "headless": True,
    "reattempt": True,  # setting this to True allows the graph to reattempt the scraping process
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="Who is Marco Perini?",
    source="https://perinim.github.io/",
    schema=None,
    config=graph_config,
)

result = smart_scraper_graph.run()
print(json.dumps(result, indent=4))
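
Unlike the other example scripts in this commit, this one does not report execution info. The same helpers they use could be appended at the end of the script above; a short sketch:

# sketch: report token usage and timings, as the other examples in this commit do
from scrapegraphai.utils import prettify_exec_info

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))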
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
"""
Basic example of a scraping pipeline using SmartScraperMultiGraph
"""

import os
import json

from dotenv import load_dotenv

from scrapegraphai.graphs import SmartScraperMultiGraph

load_dotenv()

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "api_key": os.getenv("OPENAI_API_KEY"),
        "model": "openai/gpt-4o",
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Create the SmartScraperMultiGraph instance and run it
# ************************************************

multiple_search_graph = SmartScraperMultiGraph(
    prompt="Who is Marco Perini?",
    source=[
        "https://perinim.github.io/",
        "https://perinim.github.io/cv/",
    ],
    schema=None,
    config=graph_config,
)

result = multiple_search_graph.run()
print(json.dumps(result, indent=4))
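
Since run() returns a JSON-serializable result here (it is printed with json.dumps above), persisting the merged output to disk is a natural last step; a small sketch, with the output filename chosen arbitrarily:

# sketch: save the merged result for later inspection (filename is arbitrary)
with open("result.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=4, ensure_ascii=False)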
