1- """
2- FetchNodeLevelK Module
3- """
41from typing import List , Optional
52from .base_node import BaseNode
63from ..docloaders import ChromiumLoader
@@ -18,14 +15,21 @@ class FetchNodeLevelK(BaseNode):
    (with proxy protection).

    Attributes:
-        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An optional model for embedding the fetched content.
        verbose (bool): A flag indicating whether to show print statements during execution.
+        cache_path (str): Path to cache fetched content.
+        headless (bool): Whether to run the Chromium browser in headless mode.
+        loader_kwargs (dict): Additional arguments for the content loader.
+        browser_base (dict): Optional configuration for the browser base API.
+        depth (int): Maximum depth of hyperlink graph traversal.
+        only_inside_links (bool): Whether to fetch only internal links.
+        min_input_len (int): Minimum required length of input data.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
-        node_name (str): The unique identifier name for the node, defaulting to "Parse".
+        node_name (str): The unique identifier name for the node, defaulting to "FetchLevelK".
    """

    def __init__(
@@ -35,81 +39,68 @@ def __init__(
        node_config: Optional[dict] = None,
        node_name: str = "FetchLevelK",
    ):
+        """
+        Initializes the FetchNodeLevelK instance.
+
+        Args:
+            input (str): Boolean expression defining the input keys needed from the state.
+            output (List[str]): List of output keys to be updated in the state.
+            node_config (Optional[dict]): Additional configuration for the node.
+            node_name (str): The name of the node (default is "FetchLevelK").
+        """
        super().__init__(node_name, "node", input, output, 2, node_config)
-
+
        self.embedder_model = node_config.get("embedder_model", None)
-
-        self.verbose = (
-            False if node_config is None else node_config.get("verbose", False)
-        )
-
+        self.verbose = node_config.get("verbose", False) if node_config else False
        self.cache_path = node_config.get("cache_path", False)
-
-        self.headless = (
-            True if node_config is None else node_config.get("headless", True)
-        )
-
-        self.loader_kwargs = (
-            {} if node_config is None else node_config.get("loader_kwargs", {})
-        )
-
-        self.browser_base = (
-            None if node_config is None else node_config.get("browser_base", None)
-        )
-
-        self.depth = (
-            1 if node_config is None else node_config.get("depth", 1)
-        )
-
-        self.only_inside_links = (
-            False if node_config is None else node_config.get("only_inside_links", False)
-        )
-
+        self.headless = node_config.get("headless", True) if node_config else True
+        self.loader_kwargs = node_config.get("loader_kwargs", {}) if node_config else {}
+        self.browser_base = node_config.get("browser_base", None)
+        self.depth = node_config.get("depth", 1) if node_config else 1
+        self.only_inside_links = node_config.get("only_inside_links", False) if node_config else False
        self.min_input_len = 1

    def execute(self, state: dict) -> dict:
        """
-        Executes the node's logic to fetch the HTML content of a specified URL and all its sub-links
-        and update the graph's state with the content.
+        Executes the node's logic to fetch the HTML content of a specified URL and its sub-links
+        recursively, then updates the graph's state with the fetched content.

        Args:
-            state (dict): The current state of the graph. The input keys will be used
-                to fetch the correct data types from the state.
+            state (dict): The current state of the graph.

        Returns:
            dict: The updated state with a new output key containing the fetched HTML content.

        Raises:
-            KeyError: If the input key is not found in the state, indicating that the
-                necessary information to perform the operation is missing.
+            KeyError: If the input key is not found in the state.
        """
-
        self.logger.info(f"--- Executing {self.node_name} Node ---")
-
-        # Interpret input keys based on the provided input expression
+
        input_keys = self.get_input_keys(state)
-        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]
-
        source = input_data[0]
-
+
        documents = [{"source": source}]
-
-        loader_kwargs = {}
+        loader_kwargs = self.node_config.get("loader_kwargs", {}) if self.node_config else {}

-        if self.node_config is not None:
-            loader_kwargs = self.node_config.get("loader_kwargs", {})
-
        for _ in range(self.depth):
            documents = self.obtain_content(documents, loader_kwargs)
-
+
        filtered_documents = [doc for doc in documents if 'document' in doc]
-
        state.update({self.output[0]: filtered_documents})
-
        return state
-
+
    def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
+        """
+        Fetches the HTML content of a given source URL.
+
+        Args:
+            source (str): The URL to fetch content from.
+            loader_kwargs (dict): Additional arguments for the content loader.
+
+        Returns:
+            Optional[str]: The fetched HTML content or None if fetching failed.
+        """
        self.logger.info(f"--- (Fetching HTML from: {source}) ---")

        if self.browser_base is not None:
@@ -119,63 +110,96 @@ def fetch_content(self, source: str, loader_kwargs) -> Optional[str]:
                raise ImportError("""The browserbase module is not installed.
                                  Please install it using `pip install browserbase`.""")

-            data = browser_base_fetch(self.browser_base.get("api_key"),
-                                      self.browser_base.get("project_id"), [source])
-
-            document = [Document(page_content=content,
-                                 metadata={"source": source}) for content in data]
-
+            data = browser_base_fetch(self.browser_base.get("api_key"),
+                                      self.browser_base.get("project_id"), [source])
+            document = [Document(page_content=content, metadata={"source": source}) for content in data]
        else:
            loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
-
            document = loader.load()
-
        return document
-
+
    def extract_links(self, html_content: str) -> list:
+        """
+        Extracts all hyperlinks from the HTML content.
+
+        Args:
+            html_content (str): The HTML content to extract links from.
+
+        Returns:
+            list: A list of extracted hyperlinks.
+        """
        soup = BeautifulSoup(html_content, 'html.parser')
        links = [link['href'] for link in soup.find_all('a', href=True)]
        self.logger.info(f"Extracted {len(links)} links.")
        return links
-
+
    def get_full_links(self, base_url: str, links: list) -> list:
+        """
+        Converts relative URLs to full URLs based on the base URL.
+
+        Args:
+            base_url (str): The base URL for resolving relative links.
+            links (list): A list of links to convert.
+
+        Returns:
+            list: A list of full URLs.
+        """
        full_links = []
        for link in links:
            if self.only_inside_links and link.startswith("http"):
                continue
            full_link = link if link.startswith("http") else urljoin(base_url, link)
            full_links.append(full_link)
        return full_links
-
+
    def obtain_content(self, documents: List, loader_kwargs) -> List:
+        """
+        Iterates through documents, fetching and updating content recursively.
+
+        Args:
+            documents (List): A list of documents containing the source URLs.
+            loader_kwargs (dict): Additional arguments for the content loader.
+
+        Returns:
+            List: The updated list of documents with fetched content.
+        """
        new_documents = []
        for doc in documents:
            source = doc['source']
            if 'document' not in doc:
                document = self.fetch_content(source, loader_kwargs)
-
+
                if not document or not document[0].page_content.strip():
                    self.logger.warning(f"Failed to fetch content for {source}")
                    documents.remove(doc)
                    continue
-
-                #doc['document'] = document[0].page_content
+
                doc['document'] = document
-
                links = self.extract_links(doc['document'][0].page_content)
                full_links = self.get_full_links(source, links)
-
-                # Check if the links are already present in other documents
+
                for link in full_links:
-                    # Check if any document is from the same link
                    if not any(d.get('source', '') == link for d in documents) and not any(d.get('source', '') == link for d in new_documents):
-                        # Add the document
                        new_documents.append({"source": link})
-
+
        documents.extend(new_documents)
        return documents
-
-    def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, current_depth: int = 1) -> dict:
+
+    def process_links(self, base_url: str, links: list,
+                      loader_kwargs, depth: int, current_depth: int = 1) -> dict:
+        """
+        Processes a list of links recursively up to a given depth.
+
+        Args:
+            base_url (str): The base URL for resolving relative links.
+            links (list): A list of links to process.
+            loader_kwargs (dict): Additional arguments for the content loader.
+            depth (int): The maximum depth for recursion.
+            current_depth (int): The current depth of recursion (default is 1).
+
+        Returns:
+            dict: A dictionary containing processed link content.
+        """
        content_dict = {}
        for idx, link in enumerate(links, start=1):
            full_link = link if link.startswith("http") else urljoin(base_url, link)
@@ -184,7 +208,7 @@ def process_links(self, base_url: str, links: list, loader_kwargs, depth: int, c

            if current_depth < depth:
                new_links = self.extract_links(link_content)
-                content_dict.update(self.process_links(full_link, new_links, depth, current_depth + 1))
+                content_dict.update(self.process_links(full_link, new_links, loader_kwargs, depth, current_depth + 1))
            else:
                self.logger.warning(f"Failed to fetch content for {full_link}")
-        return content_dict
+        return content_dict
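
For context on how the reworked node is meant to be driven, here is a minimal usage sketch. It is not part of the diff above: the import path, the state key "url", the output key "docs", and the config values are illustrative assumptions about how a ScrapeGraphAI graph would wire this node, based only on the signatures shown in this change.

# Minimal sketch (assumed import path and illustrative keys/values, not from this PR)
from scrapegraphai.nodes import FetchNodeLevelK

fetch_node = FetchNodeLevelK(
    input="url",                    # boolean expression over state keys
    output=["docs"],                # state key that will receive the fetched documents
    node_config={
        "verbose": True,
        "headless": True,           # run Chromium without a visible window
        "depth": 2,                 # follow hyperlinks two levels deep
        "only_inside_links": True,  # keep only relative (same-site) links
        "loader_kwargs": {},
    },
)

# execute() reads state["url"], crawls up to `depth` levels, and returns the state
# updated with state["docs"] = [{"source": ..., "document": [...]}, ...]
state = fetch_node.execute({"url": "https://example.com"})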