Skip to content

Commit 134c94e

Browse files
committed
added first asdt implementation
1 parent c032131 commit 134c94e

File tree

3 files changed

+173
-1
lines changed

3 files changed

+173
-1
lines changed

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ pandas = "2.0.3"
3232
python-dotenv = "1.0.1"
3333
tiktoken = {version = ">=0.5.2,<0.6.0"}
3434
tqdm = "4.66.1"
35-
graphviz = "0.20.1"
3635
google = "3.0.0"
3736
minify-html = "0.15.0"
3837

scrapegraphai/utils/asdt.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
from bs4 import BeautifulSoup, NavigableString
2+
from graphviz import Digraph
3+
from langchain_community.document_loaders import AsyncHtmlLoader
4+
5+
def tag_structure(tag, include_scripts=True):
    """
    Recursively convert a parsed HTML node into nested plain-Python data.

    :param tag: BeautifulSoup tag or NavigableString; ``None`` is tolerated
    :param include_scripts: Include or exclude <script> tags from the structure
    :return: For a string node, its stripped text, or ``None`` when it is blank.
        For a tag, ``{tag_name: {'attrs': {...}, 'children': [...]}}`` where
        ``children`` holds the non-empty results of its child nodes, in order.
        ``None`` for a skipped <script> tag or a missing node.
    """
    # A failed lookup such as soup.find('html') yields None; report "nothing"
    # instead of failing on tag.attrs further down.
    if tag is None:
        return None

    if isinstance(tag, NavigableString):
        # NOTE(review): bs4 documents Comment as a NavigableString subclass, so
        # HTML comments presumably end up here as ordinary text -- confirm
        # whether that is wanted.
        text = tag.strip()
        return text if text else None

    # Skip script tags if include_scripts is False
    if not include_scripts and tag.name == 'script':
        return None

    tag_info = {
        'attrs': dict(tag.attrs),
        'children': [],
    }

    for child in tag.children:
        child_structure = tag_structure(child, include_scripts=include_scripts)
        # Blank strings and skipped tags come back as None and are dropped.
        if child_structure:
            tag_info['children'].append(child_structure)

    return {tag.name: tag_info}
30+
31+
# Function to recursively traverse the structured HTML dictionary and create graph nodes and edges
32+
def add_nodes_edges(graph, structure, parent=None, include_scripts=True):
    """
    Recursively walk the structured HTML dictionary and add every tag and
    text fragment to the graph.

    :param graph: Object exposing ``node(name, label=..., ...)`` and
        ``edge(tail, head)``, e.g. a Graphviz Digraph
    :param structure: ``{tag: {'children': [...]}}`` dict, a list of such
        items, or a text string
    :param parent: Name of the parent node, or None for the root
    :param include_scripts: Include or exclude <script> tags from the visualization
    """
    # Local import: only needed to name text nodes uniquely.
    import uuid

    if isinstance(structure, dict):
        for tag, content in structure.items():
            # Skip script tags if include_scripts is False
            if tag == 'script' and not include_scripts:
                continue

            # Each content dict is a distinct live object inside `structure`,
            # so its id() is a usable unique suffix.
            node_name = f"{tag}_{id(content)}"
            graph.node(node_name, label=tag)
            if parent:
                graph.edge(parent, node_name)
            # Recursively process the children nodes
            add_nodes_edges(graph, content['children'], parent=node_name,
                            include_scripts=include_scripts)

    elif isinstance(structure, list):
        for item in structure:
            add_nodes_edges(graph, item, parent, include_scripts=include_scripts)

    elif isinstance(structure, str) and parent:
        # Truncate long text to keep the visualization readable.
        text_label = (structure[:30] + '..') if len(structure) > 30 else structure
        # id() is NOT safe for strings: equal short strings may be one shared
        # object, which would merge separate text nodes into a single one.
        text_node_name = f"text_{uuid.uuid4().hex}"
        graph.node(text_node_name, label=text_label, shape="plaintext")
        graph.edge(parent, text_node_name)
56+
57+
def has_text_content(structure):
    """
    Report whether a structured HTML node contains any non-blank text.

    :param structure: A text string, or a ``{tag: {'children': [...]}}`` dict
    :return: True for a non-blank string, or for a dict with at least one
        descendant that is a non-blank string; False for anything else
    """
    if isinstance(structure, str):
        return bool(structure.strip())

    if isinstance(structure, dict):
        for content in structure.values():
            # A node without a 'children' entry simply has no text below it.
            children = content.get('children', ())
            if any(has_text_content(child) for child in children):
                return True

    return False
65+
66+
def add_text_nodes_only(graph, structure, parent=None, include_scripts=True):
    """
    Recursively traverse the structured HTML dictionary and create graph nodes and edges
    for text content only: a tag is drawn only when some text exists beneath it.

    :param graph: Graphviz Digraph object
    :param structure: Structured HTML dictionary
    :param parent: ID of the parent node
    :param include_scripts: Include or exclude <script> tags from the visualization
    """
    # Local import: only needed to name text nodes uniquely.
    import uuid

    if isinstance(structure, dict):
        for tag, content in structure.items():
            # Skip script tags if include_scripts is False
            if not include_scripts and tag == 'script':
                continue

            # Tags with no text anywhere below them are left out, together
            # with their whole subtree.
            if not any(has_text_content(child) for child in content['children']):
                continue

            node_name = f"{tag}_{id(content)}"
            graph.node(node_name, label=tag)
            if parent:
                graph.edge(parent, node_name)
            for child in content['children']:
                add_text_nodes_only(graph, child, parent=node_name,
                                    include_scripts=include_scripts)

    elif isinstance(structure, str) and structure.strip():
        # Truncate long text to keep the visualization readable.
        text_label = (structure[:30] + '..') if len(structure) > 30 else structure
        # id() is NOT safe for strings: equal short strings may be one shared
        # object, which would merge separate text nodes into a single one.
        text_node_name = f"text_{uuid.uuid4().hex}"
        graph.node(text_node_name, label=text_label, shape="plaintext")
        if parent:
            graph.edge(parent, text_node_name)
95+
96+
# NOTE(review): everything below runs at import time -- it fetches a page over
# the network and opens an image viewer. Consider moving it under an
# `if __name__ == "__main__":` guard once nothing relies on the import side effect.
# NOTE(review): this module needs `graphviz`, which the same commit removes
# from pyproject.toml -- confirm the dependency is still declared somewhere.
loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content

# Parse HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Generate the structured HTML; fail with a clear message when the page has no
# <html> element instead of an AttributeError from deep inside tag_structure.
html_root = soup.find('html')
if html_root is None:
    raise ValueError("No <html> element found in the loaded document")
html_structure = tag_structure(html_root)

# Create a Digraph object
dot = Digraph()
dot.attr(rankdir='LR')  # Left to Right, change to 'TB' for Top to Bottom

# Draw only the tags that contain text. To draw every tag instead, call
# add_nodes_edges(dot, html_structure, include_scripts=False).
add_text_nodes_only(dot, html_structure, include_scripts=False)

# Render the graph to a file and view it
dot.render('html_structure', view=True, format='png')

scrapegraphai/utils/tree_base.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
from bs4 import BeautifulSoup, NavigableString
2+
from pyecharts import options as opts
3+
from pyecharts.charts import Tree
4+
from langchain_community.document_loaders import AsyncHtmlLoader
5+
import webbrowser
6+
7+
8+
def tag_structure(tag, include_scripts=True):
    """
    Recursively convert a parsed HTML node into the nested ``name``/``children``
    dictionaries passed to the tree chart.

    :param tag: BeautifulSoup tag or NavigableString; ``None`` is tolerated
    :param include_scripts: Include or exclude <script> tags from the structure
    :return: ``{"name": text}`` for a non-blank string (text longer than 30
        characters is cut to 30 plus "..."), ``{"name": tag_name}`` for a tag
        with no kept children, ``{"name": tag_name, "children": [...]}``
        otherwise, and ``None`` for blank strings, skipped <script> tags or a
        missing node.
    """
    # A failed lookup such as soup.find('html') yields None; report "nothing"
    # instead of failing on tag.children further down.
    if tag is None:
        return None

    if isinstance(tag, NavigableString):
        text = tag.strip()
        if not text:
            return None
        return {"name": text[:30] + "..." if len(text) > 30 else text}

    if not include_scripts and tag.name == 'script':
        return None

    children = []
    for child in tag.children:
        child_structure = tag_structure(child, include_scripts=include_scripts)
        # Blank strings and skipped tags come back as None and are dropped.
        if child_structure:
            children.append(child_structure)

    # Leaf tags carry no "children" key at all.
    tag_info = {"name": tag.name, "children": children} if children else {"name": tag.name}
    return tag_info
24+
25+
def build_tree_data(html_structure):
    """
    Wrap the root structure in a list, the form handed to the chart's ``data`` argument.

    :param html_structure: Root node dictionary, or a falsy value when there is none
    :return: ``[html_structure]``, or an empty list when ``html_structure`` is falsy
    """
    return [html_structure] if html_structure else []
27+
28+
# NOTE(review): everything below runs at import time -- it fetches a page over
# the network and opens a browser tab. Consider moving it under an
# `if __name__ == "__main__":` guard once nothing relies on the import side effect.

# Load and parse HTML content
loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content
soup = BeautifulSoup(html_content, 'html.parser')

# Generate structured HTML
html_structure = tag_structure(soup.find('html'), include_scripts=False)

# Build tree data for pyecharts
tree_data = build_tree_data(html_structure)

# Create a Tree chart
chart = Tree(init_opts=opts.InitOpts(width="100%", height="800px"))
chart.add(
    series_name="",
    data=tree_data,
    initial_tree_depth=-1,  # Set to -1 to expand all nodes initially
    layout='orthogonal',  # Can be 'radial' for radial layout
    is_roam=True,  # Allows users to zoom and pan
    # symbol_size=7,  # Adjusts the size of the nodes (optional)
)

chart.set_global_opts(
    title_opts=opts.TitleOpts(title="HTML Structure Tree"),
    tooltip_opts=opts.TooltipOpts(trigger="item", trigger_on="mousemove|click")
)

# Render the tree to an HTML file once (the original rendered the same file
# twice) and open the returned path in the browser.
html_file_path = chart.render("html_structure_tree.html")
webbrowser.open(html_file_path)

0 commit comments

Comments
 (0)