88 SearchLinkNode ,
99 ParseNode ,
1010 RAGNode ,
11- GenerateAnswerNode
11+ GenerateAnswerNode ,
12+ GraphIteratorNode ,
13+ MergeAnswersNode
1214)
1315from .abstract_graph import AbstractGraph
1416
@@ -18,12 +20,11 @@ class DeepScraperGraph(AbstractGraph):
1820 [WIP]
1921
2022 DeepScraper is a scraping pipeline that automates the process of
21- extracting information from web pages
22- using a natural language model to interpret and answer prompts.
23-
24- Unlike SmartScraper, DeepScraper can navigate to the links within the input webpage,
25- to fuflfil the task within the prompt.
23+ extracting information from web pages using a natural language model
24+ to interpret and answer prompts.
2625
 26+     Unlike SmartScraper, DeepScraper can navigate to the links within
 27+     the input webpage to fulfil the task within the prompt.
2728
2829 Attributes:
2930 prompt (str): The prompt for the graph.
@@ -50,12 +51,13 @@ class DeepScraperGraph(AbstractGraph):
5051
def __init__(self, prompt: str, source: str, config: dict):
    """
    Initialize the deep-scraping pipeline.

    Args:
        prompt (str): The natural-language task to fulfil.
        source (str): Either a URL or a local directory to scrape.
        config (dict): Configuration for the underlying graph/LLM.
    """
    super().__init__(prompt, config, source)
    # A source that starts with "http" is fetched from the web;
    # anything else is read from the local filesystem.
    is_web_source = source.startswith("http")
    self.input_key = "url" if is_web_source else "local_dir"
5555
56- def _create_graph (self ) -> BaseGraph :
56+ def _create_repeated_graph (self ) -> BaseGraph :
5757 """
58- Creates the graph of nodes representing the workflow for web scraping.
58+ Creates the graph that can be repeatedly executed to conduct search on
59+ hyperlinks within the webpage.
60+
5961 Returns:
6062 BaseGraph: A graph instance representing the web scraping workflow.
6163 """
@@ -78,6 +80,13 @@ def _create_graph(self) -> BaseGraph:
7880 "embedder_model" : self .embedder_model
7981 }
8082 )
83+ generate_answer_node = GenerateAnswerNode (
84+ input = "user_prompt & (relevant_chunks | parsed_doc | doc)" ,
85+ output = ["answer" ],
86+ node_config = {
87+ "llm_model" : self .llm_model
88+ }
89+ )
8190 search_node = SearchLinkNode (
8291 input = "user_prompt & relevant_chunks" ,
8392 output = ["relevant_links" ],
@@ -86,23 +95,60 @@ def _create_graph(self) -> BaseGraph:
8695 "embedder_model" : self .embedder_model
8796 }
8897 )
98+ graph_iterator_node = GraphIteratorNode (
99+ input = "user_prompt & relevant_links" ,
100+ output = ["results" ],
101+ node_config = {
102+ "graph_instance" : None ,
103+ "batchsize" : 1
104+ }
105+ )
106+ merge_answers_node = MergeAnswersNode (
107+ input = "user_prompt & results" ,
108+ output = ["answer" ],
109+ node_config = {
110+ "llm_model" : self .llm_model ,
111+ }
112+ )
89113
90114 return BaseGraph (
91115 nodes = [
92116 fetch_node ,
93117 parse_node ,
94118 rag_node ,
95- search_node
119+ generate_answer_node ,
120+ search_node ,
121+ graph_iterator_node ,
122+ merge_answers_node
96123 ],
97124 edges = [
98125 (fetch_node , parse_node ),
99126 (parse_node , rag_node ),
100- (rag_node , search_node )
101-
127+ (rag_node , generate_answer_node ),
128+ (rag_node , search_node ),
129+ (search_node , graph_iterator_node ),
130+ (graph_iterator_node , merge_answers_node )
102131 ],
103132 entry_point = fetch_node
104133 )
105134
135+
136+
def _create_graph(self) -> BaseGraph:
    """
    Creates the graph of nodes representing the workflow for web scraping
    n-levels deep.

    Returns:
        BaseGraph: A graph instance representing the web scraping workflow.
    """
    base_graph = self._create_repeated_graph()
    # Locate the iterator node and point it back at this graph instance:
    # the iterator re-runs the same pipeline on every hyperlink found in
    # the input webpage, which is what makes the scrape recursive.
    iterator_nodes = [
        node for node in base_graph.nodes
        if node.node_name == "GraphIterator"
    ]
    iterator_nodes[0].node_config["graph_instance"] = self
    return base_graph
151+
106152 def run (self ) -> str :
107153 """
108154 Executes the scraping process and returns the answer to the prompt.
0 commit comments