Skip to content

Commit 6e7283e

Browse files
committed
feat: add finalize_node()
1 parent e778d27 commit 6e7283e

File tree

9 files changed

+241
-30434
lines changed

9 files changed

+241
-30434
lines changed

examples/custom_graph_domtree.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
"""
2+
Example of custom graph using existing nodes
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.models import OpenAI
8+
from scrapegraphai.graphs import BaseGraph
9+
from scrapegraphai.nodes import FetchNode, GenerateAnswerNode
10+
load_dotenv()
11+
12+
# ************************************************
13+
# Define the configuration for the graph
14+
# ************************************************
15+
16+
openai_key = os.getenv("OPENAI_APIKEY")
17+
18+
graph_config = {
19+
"llm": {
20+
"api_key": openai_key,
21+
"model": "gpt-3.5-turbo",
22+
"temperature": 0,
23+
"streaming": True
24+
},
25+
}
26+
27+
# ************************************************
28+
# Define the graph nodes
29+
# ************************************************
30+
31+
llm_model = OpenAI(graph_config["llm"])
32+
33+
# define the nodes for the graph
34+
fetch_node = FetchNode(
35+
input="url | local_dir",
36+
output=["doc"],
37+
)
38+
generate_answer_node = GenerateAnswerNode(
39+
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
40+
output=["answer"],
41+
node_config={"llm": llm_model},
42+
)
43+
44+
# ************************************************
45+
# Create the graph by defining the connections
46+
# ************************************************
47+
48+
graph = BaseGraph(
49+
nodes={
50+
fetch_node,
51+
generate_answer_node,
52+
},
53+
edges={
54+
(fetch_node, generate_answer_node)
55+
},
56+
entry_point=fetch_node
57+
)
58+
59+
# ************************************************
60+
# Execute the graph
61+
# ************************************************
62+
63+
subtree_text = '''
64+
div>div -> "This is a paragraph" \n
65+
div>ul>li>a>span -> "This is a list item 1" \n
66+
div>ul>li>a>span -> "This is a list item 2" \n
67+
div>ul>li>a>span -> "This is a list item 3"
68+
'''
69+
70+
subtree_simplified_html = '''
71+
<div>
72+
<div>This is a paragraph</div>
73+
<ul>
74+
<li>
75+
<span>This is a list item 1</span>
76+
</li>
77+
<li>
78+
<span>This is a list item 2</span>
79+
</li>
80+
<li>
81+
<span>This is a list item 3</span>
82+
</li>
83+
</ul>
84+
</div>
85+
'''
86+
87+
subtree_dict_simple = {
88+
"div": {
89+
"text": {
90+
"content": "This is a paragraph",
91+
"path_to_fork": "div>div",
92+
},
93+
"ul": {
94+
"path_to_fork": "div>ul",
95+
"texts": [
96+
{
97+
"content": "This is a list item 1",
98+
"path_to_fork": "ul>li>a>span",
99+
},
100+
{
101+
"content": "This is a list item 2",
102+
"path_to_fork": "ul>li>a>span",
103+
},
104+
{
105+
"content": "This is a list item 3",
106+
"path_to_fork": "ul>li>a>span",
107+
}
108+
]
109+
}
110+
}
111+
}
112+
113+
114+
subtree_dict_complex = {
115+
"div": {
116+
"text": {
117+
"content": "This is a paragraph",
118+
"path_to_fork": "div>div",
119+
"attributes": {
120+
"classes": ["paragraph"],
121+
"ids": ["paragraph"],
122+
"hrefs": ["https://www.example.com"]
123+
}
124+
},
125+
"ul": {
126+
"text1":{
127+
"content": "This is a list item 1",
128+
"path_to_fork": "ul>li>a>span",
129+
"attributes": {
130+
"classes": ["list-item", "item-1"],
131+
"ids": ["item-1"],
132+
"hrefs": ["https://www.example.com"]
133+
}
134+
},
135+
"text2":{
136+
"content": "This is a list item 2",
137+
"path_to_fork": "ul>li>a>span",
138+
"attributes": {
139+
"classes": ["list-item", "item-2"],
140+
"ids": ["item-2"],
141+
"hrefs": ["https://www.example.com"]
142+
}
143+
}
144+
}
145+
}
146+
}
147+
148+
result, execution_info = graph.execute({
149+
"user_prompt": "How many list items are there in the document?",
150+
"local_dir": str(subtree_dict_simple)
151+
})
152+
153+
# get the answer from the result
154+
result = result.get("answer", "No answer found.")
155+
print(result)

examples/domtree_example.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,21 +46,34 @@ def print_matches_side_by_side(matches):
4646
# Usage example:
4747
# *********************************************************************************************************************
4848

49-
loader = AsyncHtmlLoader('https://www.wired.com/category/science/')
49+
loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
5050
document = loader.load()
5151
html_content = document[0].page_content
5252

5353
curr_time = time.time()
5454
# Instantiate a DOMTree with HTML content
5555
dom_tree = DOMTree(html_content)
56-
nodes, metadatas = dom_tree.collect_text_nodes() # Collect text nodes for analysis
57-
for node, metadata in zip(nodes, metadatas):
58-
print("Text:", node)
59-
print("Metadata:", metadata)
56+
# nodes, metadatas = dom_tree.collect_text_nodes() # Collect text nodes for analysis
57+
# for node, metadata in zip(nodes, metadatas):
58+
# print("Text:", node)
59+
# print("Metadata:", metadata)
6060

61+
# sub_list = dom_tree.generate_subtree_dicts() # Generate subtree dictionaries for analysis
62+
# print(sub_list)
6163
# graph = dom_tree.visualize(exclude_tags=['script', 'style', 'meta', 'link'])
62-
# subtrees = dom_tree.get_subtrees() # Retrieve subtrees rooted at fork nodes
64+
subtrees = dom_tree.get_subtrees() # Retrieve subtrees rooted at fork nodes
65+
print("Number of subtrees found:", len(subtrees))
6366

67+
# remove trees whose root node does not lead to any text
68+
text_subtrees = [subtree for subtree in subtrees if subtree.root.leads_to_text]
69+
print("Number of subtrees that lead to text:", len(text_subtrees))
70+
71+
direct_leaf_subtrees = [subtree for subtree in text_subtrees if subtree.root.has_direct_leaves]
72+
print("Number of subtrees with direct leaves beneath fork nodes:", len(direct_leaf_subtrees))
73+
74+
for subtree in direct_leaf_subtrees:
75+
print("Subtree rooted at:", subtree.root.value)
76+
subtree.traverse(lambda node: print(node))
6477
# Index subtrees by structure and content
6578
# structure_index, content_index = index_subtrees(subtrees)
6679

@@ -83,4 +96,4 @@ def print_matches_side_by_side(matches):
8396
# print("Subtree rooted at:", subtree.root.value)
8497
# subtree.traverse(lambda node: print(node))
8598
# Traverse the DOMTree and print each node
86-
# dom_tree.traverse(lambda node: print(node))
99+
# dom_tree.traverse(lambda node: print(node))

scrapegraphai/asdt/dom_tree.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ def build_dom_tree(self, soup_node, tree_node):
1515
elif isinstance(child, NavigableString):
1616
text = child.strip()
1717
if text:
18-
tree_node.add_child(TreeNode(value='text', attributes={'content': text}))
18+
new_node = TreeNode(value='text', attributes={'content': text})
19+
tree_node.add_child(new_node)
20+
new_node.finalize_node()
1921
elif isinstance(child, Tag):
2022
new_node = TreeNode(value=child.name, attributes=child.attrs)
2123
tree_node.add_child(new_node)

scrapegraphai/asdt/tree.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,42 @@ def get_subtrees(self):
1616
# Retrieves all subtrees rooted at fork nodes
1717
return self.root.get_subtrees() if self.root else []
1818

19+
def generate_subtree_dicts(self):
20+
subtree_dicts = []
21+
22+
def aggregate_text_under_fork(fork_node):
23+
text_aggregate = {
24+
"content": [],
25+
"path_to_fork": ""
26+
}
27+
for child in fork_node.children:
28+
if child.value == 'text':
29+
text_aggregate["content"].append(child.attributes['content'])
30+
elif child.is_fork:
31+
continue
32+
else:
33+
for sub_child in child.children:
34+
text_aggregate["content"].append(sub_child.attributes)
35+
36+
text_aggregate["path_to_fork"] = fork_node.closest_fork_path
37+
return text_aggregate
38+
39+
def process_node(node):
40+
if node.is_fork:
41+
texts = aggregate_text_under_fork(node)
42+
if texts["content"]: # Only add if there's text content
43+
subtree_dicts.append({
44+
node.value: {
45+
"text": texts,
46+
"path_to_fork": texts["path_to_fork"],
47+
}
48+
})
49+
for child in node.children:
50+
process_node(child)
51+
52+
process_node(self.root)
53+
return subtree_dicts
54+
1955
def visualize(self, exclude_tags = ['script']):
2056
def add_nodes_edges(tree_node, graph):
2157
if tree_node:
@@ -49,7 +85,7 @@ def add_nodes_edges(tree_node, graph):
4985

5086
# Initialize Digraph, set graph and node attributes
5187
graph = Digraph()
52-
graph.attr(size='10,10', dpi='300') # Set higher DPI for better image resolution
88+
# graph.attr(size='10,10', dpi='300') # Set higher DPI for better image resolution
5389
graph.attr('node', style='filled', fontname='Helvetica')
5490
graph.attr('edge', fontname='Helvetica')
5591

scrapegraphai/asdt/tree_node.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@ def __init__(self, value=None, attributes=None, children=None, parent=None, dept
77
self.children = children if children is not None else []
88
self.parent = parent
99
self.depth = depth
10+
# Flag to track if the subtree leads to text
1011
self.leads_to_text = False
12+
# Flag to track whether the subtree has a direct leaf node
13+
self.has_direct_leaves = False
1114
self.root_path = self._compute_root_path()
1215
self.closest_fork_path = self._compute_fork_path()
1316
self.structure_hash = None
@@ -54,14 +57,26 @@ def _compute_fork_path(self):
5457
current = current.parent
5558
path.append(current.value) # Add the fork or root node
5659
return '>'.join(reversed(path))
57-
58-
def get_subtrees(self):
60+
61+
def finalize_node(self):
62+
if self.is_text and self.is_leaf:
63+
self.update_direct_leaves_flag()
64+
65+
def update_direct_leaves_flag(self):
66+
ancestor = self.parent
67+
while ancestor and len(ancestor.children) == 1:
68+
ancestor = ancestor.parent
69+
if ancestor and ancestor.is_fork:
70+
ancestor.has_direct_leaves = True
71+
72+
def get_subtrees(self, direct_leaves=False):
5973
# This method finds and returns subtrees rooted at this node and all descendant forks
74+
# Optionally filters to include only those with direct leaves beneath fork nodes
6075
subtrees = []
61-
if self.is_fork:
76+
if self.is_fork and (not direct_leaves or self.has_direct_leaves):
6277
subtrees.append(Tree(root=self))
6378
for child in self.children:
64-
subtrees.extend(child.get_subtrees())
79+
subtrees.extend(child.get_subtrees(direct_leaves=direct_leaves))
6580
return subtrees
6681

6782
def hash_subtree_structure(self, node):
@@ -84,12 +99,16 @@ def get_all_text(self, node):
8499
return text
85100

86101
def __repr__(self):
87-
return f"TreeNode(value={self.value}, leads_to_text={self.leads_to_text}, depth={self.depth}, root_path={self.root_path}, closest_fork_path={self.closest_fork_path})"
102+
return f"TreeNode(value={self.value}, leads_to_text={self.leads_to_text}, is_fork={self.is_fork})"
88103

89104
@property
90105
def is_fork(self):
91106
return len(self.children) > 1
92107

93108
@property
94109
def is_leaf(self):
95-
return len(self.children) == 0
110+
return len(self.children) == 0
111+
112+
@property
113+
def is_text(self):
114+
return self.value == 'text'

scrapegraphai/nodes/fetch_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def execute(self, state):
7272

7373
# if it is a local directory
7474
if not source.startswith("http"):
75-
compressedDocument = [Document(page_content=remover(source), metadata={
75+
compressedDocument = [Document(page_content=source, metadata={
7676
"source": "local_dir"
7777
})]
7878

0 commit comments

Comments
 (0)