Skip to content

Commit ae69e4b

Browse files
committed
dom tree structure
1 parent 134c94e commit ae69e4b

File tree

2 files changed

+175
-33
lines changed

2 files changed

+175
-33
lines changed

scrapegraphai/utils/aaa.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
import time

from bs4 import BeautifulSoup
from bs4.element import (
    Comment,
    Declaration,
    Doctype,
    NavigableString,
    ProcessingInstruction,
    Tag,
)
from langchain_community.document_loaders import AsyncHtmlLoader
5+
6+
class TreeNode:
    """A node of a generic tree that tracks whether it leads to text.

    Attributes:
        value: Label of the node (a tag name, or ``'text'`` for text nodes).
        attributes: Dict of extra data attached to the node.
        children: List of child ``TreeNode`` objects, in insertion order.
        parent: The parent ``TreeNode``, or ``None`` for a root.
        leads_to_text: ``True`` once any descendant is a node whose value is
            ``'text'``.

    ``root_path`` and ``closest_fork_path`` are computed on access from the
    current shape of the tree, so they stay correct when siblings are added
    later or when an already-built subtree is attached to a new parent.
    """

    def __init__(self, value=None, attributes=None, children=None, parent=None):
        self.value = value
        self.attributes = attributes if attributes is not None else {}
        self.children = children if children is not None else []
        self.parent = parent
        self.leads_to_text = False  # Initialize the flag as False

        # Children handed to the constructor must point back at this node,
        # otherwise their paths would be computed as if they were roots.
        for child in self.children:
            child.parent = self
        if self.children:
            self.update_leads_to_text()

    @property
    def root_path(self):
        """'>'-joined path from the tree root down to this node.

        The root itself is always rendered as the literal ``'root'``.
        """
        return self._compute_root_path()

    @property
    def closest_fork_path(self):
        """'>'-joined path covering the single-child chain that ends here.

        The walk goes up while the parent has exactly one child. The first
        element is therefore either the tree root (by its own value) or the
        direct child of the nearest ancestor that has several children.
        """
        return self._compute_fork_path()

    def add_child(self, child_node):
        """Attach ``child_node`` as the last child and refresh the text flag."""
        child_node.parent = self
        self.children.append(child_node)
        child_node.update_paths()
        self.update_leads_to_text()  # Update this node if the child leads to text

    def update_paths(self):
        """Kept for backward compatibility; paths are now derived on access."""

    def update_leads_to_text(self):
        """Set ``leads_to_text`` from the children and propagate it upwards."""
        # Check if any child leads to text or is a text node
        if any(child.value == 'text' or child.leads_to_text for child in self.children):
            self.leads_to_text = True
            # Update the flag up the tree; stop as soon as an ancestor is
            # already flagged, because everything above it is flagged too.
            if self.parent and not self.parent.leads_to_text:
                self.parent.update_leads_to_text()

    def _compute_root_path(self):
        path = []
        current = self
        while current.parent:
            path.append(current.value)
            current = current.parent
        path.append('root')  # The root node is shown as 'root', not by its value
        return '>'.join(reversed(path))

    def _compute_fork_path(self):
        path = []
        current = self
        while current.parent and len(current.parent.children) == 1:
            path.append(current.value)
            current = current.parent
        # ``current`` is now the root or a node whose parent has several children
        path.append(current.value)
        return '>'.join(reversed(path))

    def __repr__(self):
        return f"TreeNode(value={self.value}, leads_to_text={self.leads_to_text}, root_path={self.root_path}, closest_fork_path={self.closest_fork_path})"

    @property
    def is_fork(self):
        """``True`` when the node has more than one child."""
        return len(self.children) > 1

    @property
    def is_leaf(self):
        """``True`` when the node has no children."""
        return len(self.children) == 0
62+
63+
class Tree:
    """Minimal tree wrapper holding a root node.

    Nodes only need a ``children`` sequence to be traversable.
    """

    def __init__(self, root=None):
        self.root = root

    def traverse(self, visit_func):
        """Call ``visit_func(node)`` on every node in depth-first pre-order.

        Children are visited left to right. The walk uses an explicit stack
        instead of recursion so that very deep trees do not hit Python's
        recursion limit. Falsy nodes (e.g. a ``None`` root) are skipped.

        :param visit_func: Callable invoked once per node
        """
        pending = [self.root]
        while pending:
            node = pending.pop()
            if not node:
                continue
            visit_func(node)
            # Push in reverse so the leftmost child is popped (visited) first
            pending.extend(reversed(node.children))

    def __repr__(self):
        return f"Tree(root={self.root})"
77+
78+
79+
class DOMTree(Tree):
    """Tree of ``TreeNode`` objects mirroring a parsed HTML document.

    The root is a synthetic ``'document'`` node. Every tag becomes a node
    named after the tag, and every non-blank string becomes a ``'text'`` node
    whose stripped content is stored under the ``'content'`` attribute.
    """

    # Markup-only string nodes that carry no page text. They all derive from
    # NavigableString, so they must be filtered out before the text branch;
    # otherwise ``<!DOCTYPE html>`` would show up as a text node "html".
    _NON_TEXT_STRINGS = (Comment, Doctype, Declaration, ProcessingInstruction)

    def __init__(self, html_content):
        """Parse ``html_content`` with ``html.parser`` and build the tree.

        :param html_content: HTML markup to parse
        """
        super().__init__()
        self.root = TreeNode('document')
        self.build_dom_tree(BeautifulSoup(html_content, 'html.parser'), self.root)

    def build_dom_tree(self, soup_node, tree_node):
        """Recursively mirror the children of ``soup_node`` under ``tree_node``.

        :param soup_node: BeautifulSoup node whose children are converted
        :param tree_node: TreeNode that receives the converted children
        """
        for child in soup_node.children:
            if isinstance(child, self._NON_TEXT_STRINGS):
                continue  # Skip comments, doctypes and other non-text markup
            if isinstance(child, NavigableString):
                text = child.strip()
                if text:
                    # Create a text node with value 'text' and the actual content under 'content' key
                    tree_node.add_child(TreeNode(value='text', attributes={'content': text}))
            elif isinstance(child, Tag):
                # Copy the attributes so the tree does not share state with the soup
                new_node = TreeNode(value=child.name, attributes=dict(child.attrs))
                tree_node.add_child(new_node)
                self.build_dom_tree(child, new_node)
98+
99+
# Usage example: download a page, build its DOM tree and report the build time.
# NOTE(review): these statements run at import time and perform a network
# request; consider moving them under an `if __name__ == "__main__":` guard.

loader = AsyncHtmlLoader('https://www.mymovies.it/cinema/roma/')
document = loader.load()
html_content = document[0].page_content

# perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() can jump when the system clock is adjusted.
curr_time = time.perf_counter()
# Instantiate a DOMTree with HTML content
dom_tree = DOMTree(html_content)
print(f"Time taken to build DOM tree: {time.perf_counter() - curr_time:.2f} seconds")

# Traverse the DOMTree and print each node
# dom_tree.traverse(lambda node: print(node))

scrapegraphai/utils/asdt.py

Lines changed: 64 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,54 @@
11
from bs4 import BeautifulSoup, NavigableString
22
from graphviz import Digraph
33
from langchain_community.document_loaders import AsyncHtmlLoader
4+
import json
5+
from bs4 import BeautifulSoup, NavigableString, Comment
6+
import time
47

5-
def tag_structure(tag, exclude=None):
    """
    Build a nested dictionary describing a tag: its attributes and its children.
    Non-blank text is emitted as separate ``'text'`` nodes, and tags listed in
    ``exclude`` are dropped together with everything below them.

    :param tag: BeautifulSoup tag object
    :param exclude: List of tag names to exclude from the structure
    :return: ``{tag_name: {'attrs': ..., 'children': [...]}}`` for a tag,
             ``{'text': {'content': ..., 'children': []}}`` for non-blank text,
             or None for comments, blank text and excluded tags
    """
    excluded = [] if exclude is None else exclude

    # Comments are tested before the generic string case so they are
    # dropped instead of being emitted as text nodes.
    if isinstance(tag, Comment):
        return None

    if isinstance(tag, NavigableString):
        stripped = tag.strip()
        if not stripped:
            return None
        return {'text': {'content': stripped, 'children': []}}

    if tag.name in excluded:
        return None  # Skip tags specified in the exclude list

    # Lazily convert each child; entries that came back as None are dropped
    nested = (tag_structure(child, exclude=excluded) for child in tag.children)
    return {
        tag.name: {
            'attrs': dict(tag.attrs),
            'children': [entry for entry in nested if entry],
        }
    }
3050

51+
3152
# Function to recursively traverse the structured HTML dictionary and create graph nodes and edges
3253
def add_nodes_edges(graph, structure, parent=None, include_scripts=True):
3354
if isinstance(structure, dict):
@@ -56,59 +77,69 @@ def add_nodes_edges(graph, structure, parent=None, include_scripts=True):
5677

5778
def has_text_content(structure):
    """
    Tell whether a structured HTML dictionary contains any non-blank text.

    Text is recognised in two forms: a ``{'text': {'content': ..., 'children': []}}``
    node, or a plain string child. Only ``'children'`` lists are descended into,
    so list-valued attributes (such as ``class``) are never mistaken for text.

    :param structure: A ``{tag: {'attrs': ..., 'children': [...]}}`` dictionary,
                      the inner dictionary of such a node, or a plain string
    :return: True if non-whitespace text is found anywhere in the structure
    """
    if isinstance(structure, str):
        # If it's a string with non-whitespace characters, it's text content
        return bool(structure.strip())
    if not isinstance(structure, dict):
        return False

    for key, value in structure.items():
        if key == 'children' and isinstance(value, list):
            # Inner node dictionary: look through its child nodes
            if any(has_text_content(child) for child in value):
                return True
        elif isinstance(value, dict) and isinstance(value.get('children'), list):
            # ``value`` is the body of a node named ``key``; attribute
            # dictionaries have no 'children' list and are skipped here.
            text = value.get('content')
            if key == 'text' and isinstance(text, str) and text.strip():
                return True
            if has_text_content(value):
                return True
    return False
6594

66-
def add_text_nodes_only(graph, structure, parent=None):
    """
    Recursively traverse the structured HTML dictionary and create graph nodes and edges,
    using a Graphviz Digraph object. Text nodes are drawn as plain-text leaves labelled
    with their content (truncated to 30 characters); every other tag is drawn as a node
    labelled with the tag name.

    A text node is an entry of the form ``{'text': {'content': ..., 'children': []}}``.
    An element that merely happens to be named ``text`` has no ``'content'`` key and is
    treated like any other tag.

    :param graph: Graphviz Digraph object
    :param structure: Structured HTML dictionary
    :param parent: ID of the parent node
    """
    if not isinstance(structure, dict):
        return

    for tag, content in structure.items():
        if tag == 'text' and 'content' in content:
            # Content is a text node: the text itself lives under 'content'
            text = content['content']
            text_label = (text[:30] + '...') if len(text) > 30 else text
            text_node_name = f"text_{id(content)}"
            graph.node(text_node_name, label=text_label, shape="plaintext")
            if parent:
                graph.edge(parent, text_node_name)
        else:
            # Content is a tag with children
            node_name = f"{tag}_{id(content)}"
            graph.node(node_name, label=tag)
            if parent:
                graph.edge(parent, node_name)
            for child in content.get('children', []):
                add_text_nodes_only(graph, child, parent=node_name)
95123

96-
# NOTE(review): these statements run at import time, perform a network request
# and open a viewer; consider moving them under an `if __name__ == "__main__":` guard.
loader = AsyncHtmlLoader('https://www.mymovies.it/cinema/roma/')
document = loader.load()
html_content = document[0].page_content

# perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() can jump when the system clock is adjusted.
curr_time = time.perf_counter()
# Parse HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Generate and print structured HTML
html_structure = tag_structure(soup.find('html'), exclude=['head', 'style', 'script'])
print(f"Time taken to generate structured HTML: {time.perf_counter() - curr_time:.2f} seconds")
# print(json.dumps(html_structure, indent=2))

# Create a Digraph object
dot = Digraph()
dot.attr(rankdir='LR')  # Left to Right, change to 'TB' for Top to Bottom

# Recursively add nodes and edges based on the structured HTML dictionary
# add_nodes_edges(dot, html_structure, include_scripts=False)
add_text_nodes_only(dot, html_structure)
# Render the graph to a file and view it
dot.render('html_structure', view=True, format='png')

0 commit comments

Comments
 (0)