File tree Expand file tree Collapse file tree 5 files changed +79
-3
lines changed Expand file tree Collapse file tree 5 files changed +79
-3
lines changed Original file line number Diff line number Diff line change 1+ ## [ 1.26.0-beta.8] ( https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.7...v1.26.0-beta.8 ) (2024-10-08)
2+
3+
4+ ### Features
5+
6+ * undetected_chromedriver support ([ 80ece21] ( https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/80ece2179ac47a7ea42fbae4b61504a49ca18daa ) )
7+
18## [ 1.26.0-beta.7] ( https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.0-beta.6...v1.26.0-beta.7 ) (2024-10-07)
29
310
"""
Basic example of scraping pipeline using SmartScraper
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# Graph configuration: a Groq-hosted LLM plus the undetected-chromedriver
# backend, running a headed (non-headless) browser for page fetching.
groq_key = os.getenv("GROQ_APIKEY")

graph_config = {
    "llm": {
        "model": "groq/gemma-7b-it",
        "api_key": groq_key,
        "temperature": 0
    },
    "headless": False,
    "backend": "undetected_chromedriver"
}

# Build the scraping pipeline and run it against the target page.
scraper = SmartScraperGraph(
    prompt="List me all the projects with their description.",
    # also accepts a string with the already downloaded HTML code
    source="https://perinim.github.io/projects/",
    config=graph_config
)

result = scraper.run()
print(result)

# Dump per-node execution statistics for the finished run.
graph_exec_info = scraper.get_execution_info()
print(prettify_exec_info(graph_exec_info))
Original file line number Diff line number Diff line change 2020 "api_key" : groq_key ,
2121 "temperature" : 0
2222 },
23- "headless" : False
23+ "headless" : False ,
2424}
2525
2626# ************************************************
Original file line number Diff line number Diff line change 11[project ]
22name = " scrapegraphai"
33
4- version = " 1.26.0b7 "
4+ version = " 1.26.0b8 "
55
66description = " A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
77authors = [
Original file line number Diff line number Diff line change @@ -57,6 +57,28 @@ def __init__(
5757 self .urls = urls
5858 self .load_state = load_state
5959
60+ async def ascrape_undetected_chromedriver (self , url : str ) -> str :
61+ """
62+ Asynchronously scrape the content of a given URL using undetected chrome with Selenium.
63+
64+ Args:
65+ url (str): The URL to scrape.
66+
67+ Returns:
68+ str: The scraped HTML content or an error message if an exception occurs.
69+
70+ """
71+ import undetected_chromedriver as uc
72+
73+ logger .info (f"Starting scraping with { self .backend } ..." )
74+ results = ""
75+ try :
76+ driver = uc .Chrome (headless = self .headless )
77+ results = driver .get (url ).page_content
78+ except Exception as e :
79+ results = f"Error: { e } "
80+ return results
81+
6082 async def ascrape_playwright (self , url : str ) -> str :
6183 """
6284 Asynchronously scrape the content of a given URL using Playwright's async API.
@@ -71,7 +93,7 @@ async def ascrape_playwright(self, url: str) -> str:
7193 from playwright .async_api import async_playwright
7294 from undetected_playwright import Malenia
7395
74- logger .info ("Starting scraping..." )
96+ logger .info (f "Starting scraping with { self . backend } ..." )
7597 results = ""
7698 async with async_playwright () as p :
7799 browser = await p .chromium .launch (
You can’t perform that action at this time.
0 commit comments