Skip to content

Commit 3e3e1b2

Browse files
committed
feat: Implement SmartScraperMultiParseMergeFirstGraph class that scrapes a list of URLs, merges the content first, and finally generates answers to a given prompt.
(The difference from SmartScraperMultiGraph is that in this case the content is merged before being processed by the LLM.)
1 parent 612c644 commit 3e3e1b2

File tree

2 files changed

+105
-0
lines changed

2 files changed

+105
-0
lines changed

scrapegraphai/graphs/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,3 +25,5 @@
2525
from .smart_scraper_multi_concat_graph import SmartScraperMultiConcatGraph
2626
from .code_generator_graph import CodeGeneratorGraph
2727
from .depth_search_graph import DepthSearchGraph
28+
from .smart_scraper_multi_parse_merge_first_graph import SmartScraperMultiParseMergeFirstGraph
29+
from .scrape_graph import ScrapeGraph
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
"""
SmartScraperMultiParseMergeFirstGraph Module
"""
4+
from copy import deepcopy
5+
from typing import List, Optional
6+
from pydantic import BaseModel
7+
from .base_graph import BaseGraph
8+
from .abstract_graph import AbstractGraph
9+
from .scrape_graph import ScrapeGraph
10+
from ..nodes import (
11+
GraphIteratorNode,
12+
MergeAnswersNode,
13+
)
14+
from ..utils.copy import safe_deepcopy
15+
16+
class SmartScraperMultiParseMergeFirstGraph(AbstractGraph):
    """
    SmartScraperMultiParseMergeFirstGraph is a scraping pipeline that scrapes a
    list of URLs, merges the scraped content first, and finally generates an
    answer to a given prompt. It only requires a user prompt and a list of URLs.

    The difference with SmartScraperMultiGraph is that in this case the content
    is merged before being passed to the LLM.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The list of URLs to scrape.
        config (dict): Configuration parameters for the graph.
        schema (Optional[BaseModel]): The schema for the graph output.

    Example:
        >>> search_graph = SmartScraperMultiParseMergeFirstGraph(
        ...     prompt="Who is Marco Perini?",
        ...     source=[
        ...         "https://perinim.github.io/",
        ...         "https://perinim.github.io/cv/"
        ...     ],
        ...     config={"llm": {"model": "openai/gpt-3.5-turbo"}}
        ... )
        >>> result = search_graph.run()
    """

    def __init__(self, prompt: str, source: List[str],
                 config: dict, schema: Optional[BaseModel] = None):
        # Keep independent copies: each per-URL ScrapeGraph sub-graph receives
        # copy_config/copy_schema, isolated from any mutation the parent
        # AbstractGraph performs on the originals passed to super().__init__.
        self.copy_config = safe_deepcopy(config)
        self.copy_schema = deepcopy(schema)
        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow: scrape and parse
        every URL, then merge the parsed content and generate the answer to the
        given prompt.

        Returns:
            BaseGraph: the two-node pipeline (GraphIteratorNode ->
            MergeAnswersNode).
        """
        # Runs one ScrapeGraph instance per URL; each instance is built from
        # the copied config and emits its parsed document under "parsed_doc".
        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & urls",
            output=["parsed_doc"],
            node_config={
                "graph_instance": ScrapeGraph,
                "scraper_config": self.copy_config,
            },
            schema=self.copy_schema
        )

        # Merges the per-URL parsed documents and asks the LLM for the final
        # answer to the user prompt, constrained by the optional schema.
        merge_answers_node = MergeAnswersNode(
            input="user_prompt & parsed_doc",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.copy_schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_answers_node,
            ],
            edges=[
                (graph_iterator_node, merge_answers_node),
            ],
            entry_point=graph_iterator_node,
            graph_name=self.__class__.__name__
        )

    def run(self) -> str:
        """
        Executes the scraping-and-parsing process first, then merges the
        content and generates the answer to the given prompt.

        Returns:
            str: The answer to the prompt, or "No answer found." when the
            final state contains no "answer" key.
        """
        inputs = {"user_prompt": self.prompt, "urls": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)
        return self.final_state.get("answer", "No answer found.")

0 commit comments

Comments
 (0)