@@ -1,7 +1,6 @@
 """
-Module for fetching the HTML node
+Module for checking whether a website is scrapable or not
 """
-import warnings
 from typing import List
 from urllib.parse import urlparse
 from langchain_community.document_loaders import AsyncHtmlLoader
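The `AsyncHtmlLoader` kept in this import list is, per the docstring below, what the node relies on for asynchronous document loading. As a minimal sketch of that loader in isolation (the robots.txt URL is only an example, not something this diff itself fetches):

```python
from langchain_community.document_loaders import AsyncHtmlLoader

# load() returns a list of Documents whose page_content is the raw HTML.
loader = AsyncHtmlLoader(["https://example.com/robots.txt"])
docs = loader.load()
print(docs[0].page_content[:200])
```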
@@ -13,13 +12,13 @@
 
 class RobotsNode(BaseNode):
     """
-    A node responsible for fetching the HTML content of a specified URL and updating
-    the graph's state with this content. It uses the AsyncHtmlLoader for asynchronous
+    A node responsible for checking whether a website is scrapable or not.
+    It uses the AsyncHtmlLoader for asynchronous
     document loading.
 
     This node acts as a starting point in many scraping workflows, preparing the state
     with the necessary HTML content for further processing by subsequent nodes in the graph.
 
     Attributes:
         node_name (str): The unique identifier name for the node.
         node_type (str): The type of the node, defaulting to "node". This categorization
@@ -31,6 +30,11 @@ class RobotsNode(BaseNode):
             reference the node within the graph.
         node_type (str, optional): The type of the node, limited to "node" or
             "conditional_node". Defaults to "node".
+        node_config (dict): Configuration parameters for the node.
+        force_scraping (bool): A flag indicating whether scraping should be enforced even
+            if disallowed by robots.txt. Defaults to True.
+        input (str): Input expression defining how to interpret the incoming data.
+        output (List[str]): List of output keys where the results will be stored.
 
     Methods:
         execute(state): Fetches the HTML content for the URL specified in the state and
@@ -39,15 +43,24 @@
             to succeed.
     """
 
-    def __init__(self, input: str, output: List[str], node_config: dict,
+    def __init__(self, input: str, output: List[str], node_config: dict, force_scraping: bool = True,
                  node_name: str = "Robots"):
         """
-        Initializes the FetchHTMLNode with a node name and node type.
+        Initializes the RobotsNode with a node name, input/output expressions
+        and node configuration.
+
         Arguments:
-            node_name (str): name of the node
+            input (str): Input expression defining how to interpret the incoming data.
+            output (List[str]): List of output keys where the results will be stored.
+            node_config (dict): Configuration parameters for the node.
+            force_scraping (bool): A flag indicating whether scraping should be enforced even
+                if disallowed by robots.txt. Defaults to True.
+            node_name (str, optional): The unique identifier name for the node.
+                Defaults to "Robots".
         """
         super().__init__(node_name, "node", input, output, 1)
         self.llm_model = node_config["llm"]
+        self.force_scraping = force_scraping
 
     def execute(self, state):
         """
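Since `__init__` reads `node_config["llm"]` and stores `force_scraping`, wiring the node up looks roughly like the sketch below. The `ChatOpenAI` model and the `"url"` input key are illustrative assumptions, not something this diff prescribes:

```python
# Hypothetical wiring; any LLM object your graph already builds can be
# passed under the "llm" key, since __init__ reads node_config["llm"].
from langchain_openai import ChatOpenAI  # assumed model provider

llm = ChatOpenAI()
robots_node = RobotsNode(
    input="url",                  # assumed state key holding the target URL
    output=["is_scrapable"],      # where execute() stores the yes/no verdict
    node_config={"llm": llm},
    force_scraping=False,         # honor robots.txt instead of overriding it
)
```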
@@ -86,11 +99,10 @@ def execute(self, state):
 
         source = input_data[0]
         output_parser = CommaSeparatedListOutputParser()
-        # if it is a local directory
         if not source.startswith("http"):
             raise ValueError(
                 "Operation not allowed")
-        # if it is a URL
+
         else:
             parsed_url = urlparse(source)
             base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
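The `urlparse` call above reduces the incoming URL to scheme and host, which is exactly what a robots.txt lookup needs, since the policy file lives at the site root. A quick check of that derivation:

```python
from urllib.parse import urlparse

source = "https://example.com/products/page/2?sort=price"
parsed_url = urlparse(source)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
print(base_url)                  # https://example.com
print(f"{base_url}/robots.txt")  # where the policy file is expected to live
```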
@@ -119,8 +131,11 @@ def execute(self, state):
             is_scrapable = chain.invoke({"path": source})[0]
             print(f"Is the provided URL scrapable? {is_scrapable}")
             if "no" in is_scrapable:
-                warnings.warn("Scraping this website is not allowed")
+                print("\033[33mScraping this website is not allowed\033[0m")
+                if not self.force_scraping:
+                    return {"update": "block the scraping phase"}
+            else:
+                print("\033[92mThe path is scrapable\033[0m")
 
-            # Update the state with the generated answer
             state.update({self.output[0]: is_scrapable})
             return state
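The chain above leaves the yes/no verdict to an LLM that reads robots.txt. For comparison, the standard library answers the same question deterministically; this is a sketch of that alternative, not what the diff implements:

```python
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def is_allowed(url: str, user_agent: str = "*") -> bool:
    """Classic robots.txt check; True means fetching the URL is permitted."""
    parsed = urlparse(url)
    parser = RobotFileParser(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    parser.read()  # fetches and parses the policy file
    return parser.can_fetch(user_agent, url)

print(is_allowed("https://example.com/some/page"))
```

A check like this could back up the model's answer when its output is ambiguous, at the cost of missing the looser, intent-level reading an LLM can give.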