1+ """
2+ Module for building a tree representation of an HTML document and rendering it as a Graphviz graph
3+ """
4+ import time
15from bs4 import BeautifulSoup , NavigableString
26from graphviz import Digraph
37from langchain_community .document_loaders import AsyncHtmlLoader
4- import json
58from bs4 import BeautifulSoup , NavigableString , Comment
6- import time
79
8- def tag_structure (tag , exclude = None ):
10+
11+ def tag_structure (tag , exclude = None ) -> dict :
912 """
1013 Recursively get a tag's structure, including its attributes, children, and textual content,
1114 with an option to exclude specific tags. Text is treated as separate nodes.
12-
15+
1316 :param tag: BeautifulSoup tag object
1417 :param exclude: List of tag names to exclude from the structure
1518 :return: A dict with the tag's name, attributes, children, and text nodes
@@ -26,7 +29,7 @@ def tag_structure(tag, exclude=None):
2629 text_node = {'text' : {
2730 'content' : text_content ,
2831 'children' : []
29- }
32+ }
3033 }
3134 return text_node
3235 else :
@@ -62,19 +65,23 @@ def add_nodes_edges(graph, structure, parent=None, include_scripts=True):
6265 if parent :
6366 graph .edge (parent , node_name )
6467 # Recursively process the children nodes
65- add_nodes_edges (graph , content ['children' ], parent = node_name , include_scripts = include_scripts )
68+ add_nodes_edges (
69+ graph , content ['children' ], parent = node_name , include_scripts = include_scripts )
6670
6771 elif isinstance (structure , list ):
6872 for item in structure :
69- add_nodes_edges (graph , item , parent , include_scripts = include_scripts )
73+ add_nodes_edges (graph , item , parent ,
74+ include_scripts = include_scripts )
7075
7176 elif isinstance (structure , str ) and parent :
7277 # Adding text node with limited length to keep the visualization clean
73- text_label = (structure [:30 ] + '..' ) if len (structure ) > 30 else structure
78+ text_label = (structure [:30 ] +
79+ '..' ) if len (structure ) > 30 else structure
7480 text_node_name = f"text_{ id (structure )} "
7581 graph .node (text_node_name , label = text_label , shape = "plaintext" )
7682 graph .edge (parent , text_node_name )
7783
84+
7885def has_text_content (structure ):
7986 if isinstance (structure , str ) and structure .strip ():
8087 # If it's a string with non-whitespace characters, it's text content
@@ -92,6 +99,7 @@ def has_text_content(structure):
9299 return True
93100 return False
94101
102+
95103def add_text_nodes_only (graph , structure , parent = None ):
96104 """
97105 Recursively traverse the structured HTML dictionary and create graph nodes and edges
@@ -103,10 +111,11 @@ def add_text_nodes_only(graph, structure, parent=None):
103111 """
104112 if isinstance (structure , dict ):
105113 for tag , content in structure .items ():
106-
114+
107115 if 'text' in content :
108116 # Content is a text node
109- text_label = (content ['text' ][:30 ] + '...' ) if len (content ['text' ]) > 30 else content ['text' ]
117+ text_label = (
118+ content ['text' ][:30 ] + '...' ) if len (content ['text' ]) > 30 else content ['text' ]
110119 text_node_name = f"text_{ id (content )} "
111120 graph .node (text_node_name , label = text_label , shape = "plaintext" )
112121 if parent :
@@ -130,8 +139,10 @@ def add_text_nodes_only(graph, structure, parent=None):
130139soup = BeautifulSoup (html_content , 'html.parser' )
131140
132141# Generate the structured HTML and print how long it took
133- html_structure = tag_structure (soup .find ('html' ), exclude = ['head' , 'style' , 'script' ])
134- print (f"Time taken to generate structured HTML: { time .time () - curr_time :.2f} seconds" )
142+ html_structure = tag_structure (soup .find ('html' ), exclude = [
143+ 'head' , 'style' , 'script' ])
144+ print (
145+ f"Time taken to generate structured HTML: { time .time () - curr_time :.2f} seconds" )
135146# print(json.dumps(html_structure, indent=2))
136147
137148# Create a Digraph object
@@ -142,4 +153,4 @@ def add_text_nodes_only(graph, structure, parent=None):
142153# add_nodes_edges(dot, html_structure, include_scripts=False)
143154add_text_nodes_only (dot , html_structure )
144155# Render the graph to a file and view it
145- dot .render ('html_structure' , view = True , format = 'png' )
156+ dot .render ('html_structure' , view = True , format = 'png' )
0 commit comments