
Commit e68b707

changed color and return type
1 parent ae49dee commit e68b707

File tree (2 files changed, +33 −12 lines):
  scrapegraphai/graphs/base_graph.py
  scrapegraphai/nodes/robots_node.py


scrapegraphai/graphs/base_graph.py

Lines changed: 3 additions & 1 deletion
@@ -4,6 +4,7 @@
 import time
 from langchain_community.callbacks import get_openai_callback
 
+
 class BaseGraph:
     """
     BaseGraph manages the execution flow of a graph composed of interconnected nodes.
@@ -81,7 +82,8 @@ def execute(self, initial_state: dict) -> dict:
 
             with get_openai_callback() as cb:
                 result = current_node.execute(state)
-
+                # Add the check for the RobotsNode
+
             node_exec_time = time.time() - curr_time
             total_exec_time += node_exec_time
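The new comment only marks where the robots check is meant to go; this commit does not wire it up yet. Below is a minimal sketch of what such a check could look like, assuming the sentinel dict that RobotsNode returns in the second file of this commit; the helper name is hypothetical and not part of the codebase.

# Hypothetical helper, not part of this commit: detect whether a node's
# execute() result is the "block the scraping phase" sentinel that
# RobotsNode returns when force_scraping is disabled.
def robots_check_failed(result: dict) -> bool:
    return isinstance(result, dict) and result.get("update") == "block the scraping phase"

# Inside BaseGraph.execute(), right after `result = current_node.execute(state)`,
# the node-execution loop could then bail out with: `if robots_check_failed(result): break`.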

scrapegraphai/nodes/robots_node.py

Lines changed: 30 additions & 11 deletions
@@ -1,7 +1,6 @@
 """
-Module for fetching the HTML node
+Module for checking whether a website is scrapable or not
 """
-import warnings
 from typing import List
 from urllib.parse import urlparse
 from langchain_community.document_loaders import AsyncHtmlLoader
@@ -13,13 +12,17 @@
 
 class RobotsNode(BaseNode):
     """
-    A node responsible for fetching the HTML content of a specified URL and updating
-    the graph's state with this content. It uses the AsyncHtmlLoader for asynchronous
+    A node responsible for checking whether a website is scrapable or not.
+    It uses the AsyncHtmlLoader for asynchronous
     document loading.
 
     This node acts as a starting point in many scraping workflows, preparing the state
     with the necessary HTML content for further processing by subsequent nodes in the graph.
 
+    Attributes:
+    This node acts as a starting point in many scraping workflows, preparing the state
+    with the necessary HTML content for further processing by subsequent nodes in the graph.
+
     Attributes:
         node_name (str): The unique identifier name for the node.
         node_type (str): The type of the node, defaulting to "node". This categorization
@@ -31,6 +34,11 @@ class RobotsNode(BaseNode):
         reference the node within the graph.
         node_type (str, optional): The type of the node, limited to "node" or
         "conditional_node". Defaults to "node".
+        node_config (dict): Configuration parameters for the node.
+        force_scraping (bool): A flag indicating whether scraping should be enforced even
+            if disallowed by robots.txt. Defaults to True.
+        input (str): Input expression defining how to interpret the incoming data.
+        output (List[str]): List of output keys where the results will be stored.
 
     Methods:
         execute(state): Fetches the HTML content for the URL specified in the state and
@@ -39,15 +47,24 @@ class RobotsNode(BaseNode):
         to succeed.
     """
 
-    def __init__(self, input: str, output: List[str], node_config: dict,
+    def __init__(self, input: str, output: List[str], node_config: dict, force_scraping=True,
                  node_name: str = "Robots"):
         """
-        Initializes the FetchHTMLNode with a node name and node type.
+        Initializes the RobotsNode with a node name, input/output expressions
+        and node configuration.
+
         Arguments:
-            node_name (str): name of the node
+            input (str): Input expression defining how to interpret the incoming data.
+            output (List[str]): List of output keys where the results will be stored.
+            node_config (dict): Configuration parameters for the node.
+            force_scraping (bool): A flag indicating whether scraping should be enforced even
+                if disallowed by robots.txt. Defaults to True.
+            node_name (str, optional): The unique identifier name for the node.
+                Defaults to "Robots".
         """
         super().__init__(node_name, "node", input, output, 1)
         self.llm_model = node_config["llm"]
+        self.force_scraping = force_scraping
 
     def execute(self, state):
         """
@@ -86,11 +103,10 @@ def execute(self, state):
 
         source = input_data[0]
         output_parser = CommaSeparatedListOutputParser()
-        # if it is a local directory
         if not source.startswith("http"):
             raise ValueError(
                 "Operation not allowed")
-        # if it is a URL
+
         else:
             parsed_url = urlparse(source)
             base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
@@ -119,8 +135,11 @@ def execute(self, state):
         is_scrapable = chain.invoke({"path": source})[0]
         print(f"Is the provided URL scrapable? {is_scrapable}")
         if "no" in is_scrapable:
-            warnings.warn("Scraping this website is not allowed")
+            print("\033[33mScraping this website is not allowed\033[0m")
+            if not self.force_scraping:
+                return {"update": "block the scraping phase"}
+        else:
+            print("\033[92mThe path is scrapable\033[0m")
 
-        # Update the state with the generated answer
         state.update({self.output[0]: is_scrapable})
         return state
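For orientation, here is a minimal usage sketch of the updated node. The import path, the "llm" config key, the choice of chat model, and the state/key names are assumptions read off this diff, not verified against the rest of the repository.

# Hypothetical usage sketch; paths, keys, and state shape are assumptions
# based on this diff, not a verified example from the repository.
from langchain_openai import ChatOpenAI          # any LangChain-compatible chat model
from scrapegraphai.nodes.robots_node import RobotsNode

llm_model = ChatOpenAI()                          # requires OPENAI_API_KEY in the environment

robots_node = RobotsNode(
    input="url",                      # state key assumed to hold the target URL
    output=["is_scrapable"],          # state key where the yes/no answer is written
    node_config={"llm": llm_model},
    force_scraping=False,             # with False, a disallowed URL stops the run
)

state = robots_node.execute({"url": "https://example.com"})
print(state.get("is_scrapable"))

With the default force_scraping=True, a robots.txt disallow only triggers the yellow warning print and execution continues; with force_scraping=False, execute() returns the "block the scraping phase" sentinel instead of the updated state.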
