
Commit 57fdaf9

create search_link_graph
1 parent cf3ab55 commit 57fdaf9

File tree: 4 files changed (+153 -10 lines)
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,43 @@

"""
Basic example of a scraping pipeline using SearchLinkGraph
"""
from scrapegraphai.graphs import SearchLinkGraph
from scrapegraphai.utils import prettify_exec_info
# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        # "base_url": "http://localhost:11434",  # set ollama URL arbitrarily
    },
    "verbose": True,
    "headless": False
}

# ************************************************
# Create the SearchLinkGraph instance and run it
# ************************************************

smart_scraper_graph = SearchLinkGraph(
    source="https://sport.sky.it/nba?gr=www",
    config=graph_config
)

result = smart_scraper_graph.run()
print(result)

# ************************************************
# Get graph execution info
# ************************************************

graph_exec_info = smart_scraper_graph.get_execution_info()
print(prettify_exec_info(graph_exec_info))
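
The list printed above can be post-processed like any other Python list. A minimal follow-on sketch, continuing from the result variable of the example and assuming the links come back as flat URL strings (the LLM fallback inside SearchLinkNode may return differently shaped data):

from urllib.parse import urlparse

# Hypothetical post-processing of the links returned by SearchLinkGraph.run();
# `result` is the value printed in the example above.
unique_links = sorted(set(result))
on_site = [link for link in unique_links if urlparse(link).netloc.endswith("sky.it")]

print(f"{len(unique_links)} unique links, {len(on_site)} on sky.it")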

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -23,3 +23,4 @@
 from .script_creator_multi_graph import ScriptCreatorMultiGraph
 from .markdown_scraper_graph import MDScraperGraph
 from .markdown_scraper_multi_graph import MDScraperMultiGraph
+from .search_link_graph import SearchLinkGraph
scrapegraphai/graphs/search_link_graph.py

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@

""" SearchLinkGraph Module """
from typing import Optional
import logging
from pydantic import BaseModel
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph

from ..nodes import (FetchNode, ParseNode, SearchLinkNode)


class SearchLinkGraph(AbstractGraph):
    """
    SearchLinkGraph is a scraping pipeline that collects the hyperlinks found in a web page,
    extracting them with a regular expression first and falling back to a language model
    when the regular expression fails.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel): The schema for the graph output.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client,
            configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.

    Args:
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (BaseModel, optional): The schema for the graph output. Defaults to None.

    Example:
        >>> search_link_graph = SearchLinkGraph(
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_link_graph.run()
    """

    def __init__(self, source: str, config: dict, schema: Optional[BaseModel] = None):
        super().__init__("", config, source, schema)

        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """

        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc", "link_urls", "img_urls"],
            node_config={
                "llm_model": self.llm_model,
                "force": self.config.get("force", False),
                "cut": self.config.get("cut", True),
                "loader_kwargs": self.config.get("loader_kwargs", {}),
            }
        )
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token
            }
        )
        search_link_node = SearchLinkNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "llm_model": self.llm_model,
                "chunk_size": self.model_token
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                search_link_node
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, search_link_node)
            ],
            entry_point=fetch_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the link-extraction pipeline and returns the links found in the source.

        Returns:
            The links extracted from the source, as stored under the "parsed_doc"
            key of the final state.
        """

        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("parsed_doc", "No answer found.")
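
For readers new to the library, the sketch below traces how the shared state dictionary is expected to move through the three nodes wired together in _create_graph. The key names come from the node declarations above; the annotated values are illustrative assumptions, not actual library output.

# Rough shape of the pipeline state, step by step (illustrative only).
state = {"user_prompt": "", "url": "https://sport.sky.it/nba?gr=www"}

# After FetchNode (output=["doc", "link_urls", "img_urls"]):
#   state["doc"]        -> the fetched page content
#   state["link_urls"]  -> anchors collected while fetching
#   state["img_urls"]   -> image sources collected while fetching

# After ParseNode (input="doc", output=["parsed_doc"]):
#   state["parsed_doc"] -> the document split into chunks sized by model_token

# After SearchLinkNode (input="doc", output=["parsed_doc"]):
#   state["parsed_doc"] -> the list of extracted links, which run() finally returns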

scrapegraphai/nodes/search_link_node.py

Lines changed: 5 additions & 10 deletions
@@ -68,11 +68,8 @@ def execute(self, state: dict) -> dict:
 
         self.logger.info(f"--- Executing {self.node_name} Node ---")
 
-        # Interpret input keys based on the provided input expression
-        input_keys = self.get_input_keys(state)
 
-        user_prompt = state[input_keys[0]]
-        parsed_content_chunks = state[input_keys[1]]
+        parsed_content_chunks = state.get("doc")
         output_parser = JsonOutputParser()
 
         relevant_links = []
@@ -86,7 +83,8 @@ def execute(self, state: dict) -> dict:
         ):
             try:
                 # Primary approach: Regular expression to extract links
-                links = re.findall(r'(https?://\S+)', chunk.page_content)
+                links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))
+
                 relevant_links += links
             except Exception as e:
                 # Fallback approach: Using the LLM to extract links
@@ -95,9 +93,6 @@ def execute(self, state: dict) -> dict:
                 You are a website scraper and you have just scraped the following content from a website.
                 Content: {content}
 
-                You are now tasked with identifying all hyper links within the content that are potentially
-                relevant to the user task: {user_prompt}
-
                 Assume relevance broadly, including any links that might be related or potentially useful
                 in relation to the task.
 
@@ -124,9 +119,9 @@ def execute(self, state: dict) -> dict:
                 )
                 merge_chain = merge_prompt | self.llm_model | output_parser
                 answer = merge_chain.invoke(
-                    {"content": chunk.page_content, "user_prompt": user_prompt}
+                    {"content": chunk.page_content}
                 )
                 relevant_links += answer
 
         state.update({self.output[0]: relevant_links})
-        return state
+        return state
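
To see what the tightened regular expression buys over the old one, here is a small standalone check; the sample HTML string is made up for illustration, but both patterns are exactly the ones in the diff above.

import re

# Made-up sample content: links embedded in an HTML attribute and in square brackets.
sample = '<a href="https://example.com/page">link</a> see [https://example.com/docs] too'

old_links = re.findall(r'(https?://\S+)', sample)
new_links = re.findall(r'https?://[^\s"<>\]]+', sample)

print(old_links)  # ['https://example.com/page">link</a>', 'https://example.com/docs]']
print(new_links)  # ['https://example.com/page', 'https://example.com/docs']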
