Skip to content

Commit c5f9fca

Browse files
committed
Merge branch 'asdt' of https://github.com/VinciGit00/Scrapegraph-ai into asdt
2 parents c927f70 + f232717 commit c5f9fca

File tree

1 file changed

+24
-13
lines changed

1 file changed

+24
-13
lines changed

scrapegraphai/utils/asdt.py

Lines changed: 24 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,18 @@
1+
"""
2+
Module for creating the tree
3+
"""
4+
import time
15
from bs4 import BeautifulSoup, NavigableString
26
from graphviz import Digraph
37
from langchain_community.document_loaders import AsyncHtmlLoader
4-
import json
58
from bs4 import BeautifulSoup, NavigableString, Comment
6-
import time
79

8-
def tag_structure(tag, exclude=None):
10+
11+
def tag_structure(tag, exclude=None) -> dict:
912
"""
1013
Recursively get a tag's structure, including its attributes, children, and textual content,
1114
with an option to exclude specific tags. Text is treated as separate nodes.
12-
15+
1316
:param tag: BeautifulSoup tag object
1417
:param exclude: List of tag names to exclude from the structure
1518
:return: A dict with the tag's name, attributes, children, and text nodes
@@ -26,7 +29,7 @@ def tag_structure(tag, exclude=None):
2629
text_node = {'text': {
2730
'content': text_content,
2831
'children': []
29-
}
32+
}
3033
}
3134
return text_node
3235
else:
@@ -62,19 +65,23 @@ def add_nodes_edges(graph, structure, parent=None, include_scripts=True):
6265
if parent:
6366
graph.edge(parent, node_name)
6467
# Recursively process the children nodes
65-
add_nodes_edges(graph, content['children'], parent=node_name, include_scripts=include_scripts)
68+
add_nodes_edges(
69+
graph, content['children'], parent=node_name, include_scripts=include_scripts)
6670

6771
elif isinstance(structure, list):
6872
for item in structure:
69-
add_nodes_edges(graph, item, parent, include_scripts=include_scripts)
73+
add_nodes_edges(graph, item, parent,
74+
include_scripts=include_scripts)
7075

7176
elif isinstance(structure, str) and parent:
7277
# Adding text node with limited length to keep the visualization clean
73-
text_label = (structure[:30] + '..') if len(structure) > 30 else structure
78+
text_label = (structure[:30] +
79+
'..') if len(structure) > 30 else structure
7480
text_node_name = f"text_{id(structure)}"
7581
graph.node(text_node_name, label=text_label, shape="plaintext")
7682
graph.edge(parent, text_node_name)
7783

84+
7885
def has_text_content(structure):
7986
if isinstance(structure, str) and structure.strip():
8087
# If it's a string with non-whitespace characters, it's text content
@@ -92,6 +99,7 @@ def has_text_content(structure):
9299
return True
93100
return False
94101

102+
95103
def add_text_nodes_only(graph, structure, parent=None):
96104
"""
97105
Recursively traverse the structured HTML dictionary and create graph nodes and edges
@@ -103,10 +111,11 @@ def add_text_nodes_only(graph, structure, parent=None):
103111
"""
104112
if isinstance(structure, dict):
105113
for tag, content in structure.items():
106-
114+
107115
if 'text' in content:
108116
# Content is a text node
109-
text_label = (content['text'][:30] + '...') if len(content['text']) > 30 else content['text']
117+
text_label = (
118+
content['text'][:30] + '...') if len(content['text']) > 30 else content['text']
110119
text_node_name = f"text_{id(content)}"
111120
graph.node(text_node_name, label=text_label, shape="plaintext")
112121
if parent:
@@ -130,8 +139,10 @@ def add_text_nodes_only(graph, structure, parent=None):
130139
soup = BeautifulSoup(html_content, 'html.parser')
131140

132141
# Generate and print structured HTML
133-
html_structure = tag_structure(soup.find('html'), exclude=['head', 'style', 'script'])
134-
print(f"Time taken to generate structured HTML: {time.time() - curr_time:.2f} seconds")
142+
html_structure = tag_structure(soup.find('html'), exclude=[
143+
'head', 'style', 'script'])
144+
print(
145+
f"Time taken to generate structured HTML: {time.time() - curr_time:.2f} seconds")
135146
# print(json.dumps(html_structure, indent=2))
136147

137148
# Create a Digraph object
@@ -142,4 +153,4 @@ def add_text_nodes_only(graph, structure, parent=None):
142153
# add_nodes_edges(dot, html_structure, include_scripts=False)
143154
add_text_nodes_only(dot, html_structure)
144155
# Render the graph to a file and view it
145-
dot.render('html_structure', view=True, format='png')
156+
dot.render('html_structure', view=True, format='png')

0 commit comments

Comments
 (0)