import asyncio
import os
import json
from dotenv import load_dotenv
from scrapegraphai.docloaders.chromium import ChromiumLoader  # Import your ChromiumLoader class
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info
from aiohttp import ClientError

# Load environment variables for API keys
load_dotenv()

# ************************************************
# Define function to analyze content with ScrapegraphAI
# ************************************************
async def analyze_content_with_scrapegraph(content: str) -> dict:
    """
    Analyze scraped content using ScrapegraphAI.

    Args:
        content (str): The scraped HTML or text content.

    Returns:
        dict: The result from ScrapegraphAI analysis, or an
            ``{"error": ...}`` dict describing why the analysis failed.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        # Fail fast with a clear message instead of an opaque SDK error later.
        print("❌ ScrapegraphAI analysis failed: OPENAI_API_KEY is not set")
        return {"error": "OPENAI_API_KEY is not set"}
    try:
        # Initialize ScrapegraphAI SmartScraperGraph
        smart_scraper = SmartScraperGraph(
            prompt="Summarize the main content of this webpage and extract any contact information.",
            source=content,  # Pass the content directly
            config={
                "llm": {
                    "api_key": api_key,
                    "model": "openai/gpt-4o",
                },
                "verbose": True,
            },
        )
        # SmartScraperGraph.run() is synchronous; run it in the default
        # executor so it does not block the event loop while the LLM call
        # is in flight.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, smart_scraper.run)
    except Exception as e:
        # Boundary handler: report the failure and return an error payload
        # rather than propagating, so one bad page doesn't stop the batch.
        print(f"❌ ScrapegraphAI analysis failed: {e}")
        return {"error": str(e)}

# ************************************************
# Test scraper and ScrapegraphAI pipeline
# ************************************************
async def test_scraper_with_analysis(scraper: ChromiumLoader, urls: list):
    """
    Scrape each URL with the given loader, then analyze the content with
    ScrapegraphAI.

    Args:
        scraper (ChromiumLoader): The ChromiumLoader instance.
        urls (list): A list of URLs to scrape.
    """
    for url in urls:
        try:
            print(f"\n🔎 Scraping: {url} using {scraper.backend}...")
            result = await scraper.scrape(url)

            # NOTE(review): this substring check can false-positive on pages
            # whose body legitimately contains "Error" — confirm scrape()'s
            # failure contract before tightening it.
            if "Error" in result or not result.strip():
                print(f"❌ Failed to scrape {url}: {result}")
            else:
                print(f"✅ Successfully scraped {url}. Content (first 200 chars): {result[:200]}")

                # Pass scraped content to ScrapegraphAI for analysis
                print("🤖 Analyzing content with ScrapegraphAI...")
                analysis_result = await analyze_content_with_scrapegraph(result)
                print("📝 Analysis Result:")
                # default=str keeps the dump from raising TypeError when the
                # analysis result contains non-JSON-serializable objects.
                print(json.dumps(analysis_result, indent=4, default=str))

        except ClientError as ce:
            print(f"❌ Network error while scraping {url}: {ce}")
        except Exception as e:
            print(f"❌ Unexpected error while scraping {url}: {e}")

# ************************************************
# Main Execution
# ************************************************
async def main():
    """Run the scrape-and-analyze pipeline across every backend/browser combo."""
    urls_to_scrape = [
        "https://example.com",
        "https://www.python.org",
        "https://invalid-url.test",
    ]

    # Exercise the same URL set against each backend/browser combination.
    # Each combination gets its own try block so that, e.g., a chromium
    # failure does not prevent the firefox run for the same backend
    # (the original single try per backend skipped the second browser).
    for backend in ("playwright", "selenium"):
        print(f"\n--- Testing {backend.capitalize()} Backend ---")
        for browser_name in ("chromium", "firefox"):
            try:
                scraper = ChromiumLoader(
                    urls=urls_to_scrape,
                    backend=backend,
                    headless=True,
                    browser_name=browser_name,
                )
                await test_scraper_with_analysis(scraper, urls_to_scrape)
            except ImportError as ie:
                print(f"❌ {backend.capitalize()} ImportError: {ie}")
            except Exception as e:
                print(f"❌ Error initializing {backend.capitalize()} ChromiumLoader: {e}")
# Script entry point: run the async pipeline and report any fatal outcome.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # User pressed Ctrl-C; exit with a notice instead of a traceback.
        print("❌ Program interrupted by user.")
    except Exception as exc:
        # Last-resort boundary: surface the error without a raw traceback.
        print(f"❌ Program crashed: {exc}")