5 files changed, +66 -5
lines changed Original file line number Diff line number Diff line change 1+ """
2+ Basic example of scraping pipeline using SmartScraper
3+ """
4+ import os
5+ import json
6+ from dotenv import load_dotenv
7+ from scrapegraphai .graphs import SmartScraperGraph
8+ from scrapegraphai .utils import prettify_exec_info
9+
# Load variables from a local .env file (if any) into the environment so the
# SCRAPEGRAPH_API_KEY lookup below can succeed.
load_dotenv()

# ------------------------------------------------
# Graph configuration
# ------------------------------------------------

# Settings for the "llm" entry: the model identifier plus the API key taken
# from the environment (os.getenv returns None when the variable is unset).
llm_settings = {
    "model": "scrapegraphai/smart-scraper",
    "api_key": os.getenv("SCRAPEGRAPH_API_KEY"),
}

graph_config = {
    "llm": llm_settings,
    "verbose": True,
    "headless": False,
}

# ------------------------------------------------
# Build the SmartScraperGraph and execute it
# ------------------------------------------------

smart_scraper_graph = SmartScraperGraph(
    prompt="Extract me all the articles",
    source="https://www.wired.com",
    config=graph_config,
)

result = smart_scraper_graph.run()

# Serialize the result as indented JSON before printing it.
result_json = json.dumps(result, indent=4)
print(result_json)

# ------------------------------------------------
# Report graph execution info
# ------------------------------------------------

graph_exec_info = smart_scraper_graph.get_execution_info()
exec_report = prettify_exec_info(graph_exec_info)
print(exec_report)
Original file line number Diff line number Diff line change @@ -43,7 +43,8 @@ dependencies = [
4343 " transformers>=4.44.2" ,
4444 " googlesearch-python>=1.2.5" ,
4545 " simpleeval>=1.0.0" ,
46- " async_timeout>=4.0.3"
46+ " async_timeout>=4.0.3" ,
47+ " scrapegraph-py>=0.0.3"
4748]
4849
4950license = " MIT"
Original file line number Diff line number Diff line change @@ -353,7 +353,7 @@ pyasn1==0.6.0
353353 # via rsa
354354pyasn1-modules==0.4.0
355355 # via google-auth
356- pydantic==2.8.2
356+ pydantic==2.10.1
357357 # via burr
358358 # via fastapi
359359 # via fastapi-pagination
@@ -368,7 +368,8 @@ pydantic==2.8.2
368368 # via openai
369369 # via pydantic-settings
370370 # via qdrant-client
371- pydantic-core==2.20.1
371+ # via scrapegraph-py
372+ pydantic-core==2.27.1
372373 # via pydantic
373374pydantic-settings==2.5.2
374375 # via langchain-community
@@ -396,6 +397,7 @@ python-dateutil==2.9.0.post0
396397 # via pandas
397398python-dotenv==1.0.1
398399 # via pydantic-settings
400+ # via scrapegraph-py
399401 # via scrapegraphai
400402pytz==2024.1
401403 # via pandas
@@ -424,6 +426,7 @@ requests==2.32.3
424426 # via langchain-community
425427 # via langsmith
426428 # via mistral-common
429+ # via scrapegraph-py
427430 # via sphinx
428431 # via streamlit
429432 # via tiktoken
@@ -439,6 +442,8 @@ s3transfer==0.10.2
439442 # via boto3
440443safetensors==0.4.5
441444 # via transformers
445+ scrapegraph-py==0.0.3
446+ # via scrapegraphai
442447semchunk==2.2.0
443448 # via scrapegraphai
444449sentencepiece==0.2.0
Original file line number Diff line number Diff line change @@ -257,7 +257,7 @@ pyasn1==0.6.0
257257 # via rsa
258258pyasn1-modules==0.4.0
259259 # via google-auth
260- pydantic==2.8.2
260+ pydantic==2.10.1
261261 # via google-generativeai
262262 # via langchain
263263 # via langchain-aws
@@ -269,7 +269,8 @@ pydantic==2.8.2
269269 # via openai
270270 # via pydantic-settings
271271 # via qdrant-client
272- pydantic-core==2.20.1
272+ # via scrapegraph-py
273+ pydantic-core==2.27.1
273274 # via pydantic
274275pydantic-settings==2.5.2
275276 # via langchain-community
@@ -286,6 +287,7 @@ python-dateutil==2.9.0.post0
286287 # via pandas
287288python-dotenv==1.0.1
288289 # via pydantic-settings
290+ # via scrapegraph-py
289291 # via scrapegraphai
290292pytz==2024.1
291293 # via pandas
@@ -313,6 +315,7 @@ requests==2.32.3
313315 # via langchain-community
314316 # via langsmith
315317 # via mistral-common
318+ # via scrapegraph-py
316319 # via tiktoken
317320 # via transformers
318321rpds-py==0.20.0
@@ -324,6 +327,8 @@ s3transfer==0.10.2
324327 # via boto3
325328safetensors==0.4.5
326329 # via transformers
330+ scrapegraph-py==0.0.3
331+ # via scrapegraphai
327332semchunk==2.2.0
328333 # via scrapegraphai
329334sentencepiece==0.2.0
Original file line number Diff line number Diff line change 1313 ConditionalNode
1414)
1515from ..prompts import REGEN_ADDITIONAL_INFO
16+ from scrapegraph_py import ScrapeGraphClient , smart_scraper
1617
1718class SmartScraperGraph (AbstractGraph ):
1819 """
@@ -59,6 +60,11 @@ def _create_graph(self) -> BaseGraph:
5960 Returns:
6061 BaseGraph: A graph instance representing the web scraping workflow.
6162 """
63+ if self .llm_model == "scrapegraphai/smart-scraper" :
64+ client = ScrapeGraphClient (self .config .get ("api_key" ))
65+
66+ result = smart_scraper (client , self .source , self .prompt )
67+ return result
6268
6369 fetch_node = FetchNode (
6470 input = "url| local_dir" ,
You can’t perform that action at this time.
0 commit comments