import uuid

from bs4 import BeautifulSoup, NavigableString
from graphviz import Digraph
from langchain_community.document_loaders import AsyncHtmlLoader
4+
def tag_structure(tag, include_scripts=True):
    """
    Recursively convert a BeautifulSoup node into a nested, plain-Python structure.

    :param tag: BeautifulSoup tag or text node; ``None`` (for example the result
        of a ``soup.find`` that matched nothing) is accepted and yields ``None``
    :param include_scripts: Include or exclude <script> tags from the structure
    :return: ``{tag_name: {'attrs': {...}, 'children': [...]}}`` for a tag, the
        whitespace-stripped text for a text node, or ``None`` when the node is
        missing, blank, or an excluded <script>
    """
    # A failed lookup hands us None; report "nothing here" instead of crashing on .name.
    if tag is None:
        return None

    # bs4's NavigableString subclasses str, so this one check covers parsed
    # text nodes as well as plain strings.
    if isinstance(tag, str):
        text = tag.strip()
        return text or None

    # Skip script tags if include_scripts is False
    if not include_scripts and tag.name == 'script':
        return None

    children = []
    for child in tag.children:
        child_structure = tag_structure(child, include_scripts=include_scripts)
        # None marks blank text and skipped tags; leave those out entirely.
        if child_structure:
            children.append(child_structure)

    return {tag.name: {'attrs': dict(tag.attrs), 'children': children}}
30+
def add_nodes_edges(graph, structure, parent=None, include_scripts=True):
    """
    Recursively traverse the structured HTML dictionary and create graph nodes and edges.

    :param graph: Graph object exposing ``node(name, label=..., ...)`` and
        ``edge(tail, head)`` (a Graphviz Digraph in this file)
    :param structure: A ``{tag: {'children': [...]}}`` dict, a list of such
        structures, or a text string
    :param parent: Name of the parent graph node, or None at the root
    :param include_scripts: Include or exclude <script> tags from the visualization
    """
    if isinstance(structure, dict):
        for tag, content in structure.items():
            # Skip script tags if include_scripts is False
            if tag == 'script' and not include_scripts:
                continue

            # id() is unique among the dicts of one structure as long as each
            # tag has its own content dict.
            node_name = f"{tag}_{id(content)}"
            graph.node(node_name, label=tag)
            if parent:
                graph.edge(parent, node_name)
            # Recursively process the children nodes
            add_nodes_edges(graph, content['children'], parent=node_name, include_scripts=include_scripts)

    elif isinstance(structure, list):
        for item in structure:
            add_nodes_edges(graph, item, parent, include_scripts=include_scripts)

    elif isinstance(structure, str) and parent:
        # Adding text node with limited length to keep the visualization clean
        text_label = (structure[:30] + '..') if len(structure) > 30 else structure
        # id() is not a safe identity for strings: CPython shares one object for
        # equal short strings (e.g. every single-character string), which would
        # merge separate text nodes with the same content into one graph node.
        text_node_name = f"text_{uuid.uuid4().hex}"
        graph.node(text_node_name, label=text_label, shape="plaintext")
        graph.edge(parent, text_node_name)
56+
def has_text_content(structure):
    """
    Tell whether a structure is, or contains anywhere below it, non-blank text.

    :param structure: A text string or a ``{tag: {'children': [...]}}`` dict
    :return: True if any non-whitespace text is found; False otherwise,
        including for any value that is neither a string nor a dict
    """
    if isinstance(structure, dict):
        # One hit anywhere among the children of any tag is enough.
        return any(
            has_text_content(node)
            for info in structure.values()
            for node in info['children']
        )
    return isinstance(structure, str) and bool(structure.strip())
65+
def _has_visible_text(structure, include_scripts):
    """Return True if structure would contribute at least one text node to the graph."""
    if isinstance(structure, str):
        return bool(structure.strip())
    if isinstance(structure, dict):
        return any(
            _has_visible_text(child, include_scripts)
            for tag, content in structure.items()
            if include_scripts or tag != 'script'
            for child in content['children']
        )
    return False


def add_text_nodes_only(graph, structure, parent=None, include_scripts=True):
    """
    Recursively traverse the structured HTML dictionary and create graph nodes and edges
    for text content only.

    A tag is drawn only when at least one text node that will actually be drawn
    lies beneath it, so with ``include_scripts=False`` a tag whose only text is
    inside a <script> is left out rather than drawn as an empty node.

    :param graph: Graphviz Digraph object
    :param structure: Structured HTML dictionary
    :param parent: ID of the parent node
    :param include_scripts: Include or exclude <script> tags from the visualization
    """
    if isinstance(structure, dict):
        for tag, content in structure.items():
            # Skip script tags if include_scripts is False
            if not include_scripts and tag == 'script':
                continue

            children = content['children']
            if not any(_has_visible_text(child, include_scripts) for child in children):
                continue

            node_name = f"{tag}_{id(content)}"
            graph.node(node_name, label=tag)
            if parent:
                graph.edge(parent, node_name)
            for child in children:
                add_text_nodes_only(graph, child, parent=node_name, include_scripts=include_scripts)
    elif isinstance(structure, str) and structure.strip():
        # Keep labels short so the rendered graph stays readable.
        text_label = (structure[:30] + '..') if len(structure) > 30 else structure
        # id() is not a safe identity for strings: CPython shares one object for
        # equal short strings (e.g. every single-character string), which would
        # merge separate text nodes with the same content into one graph node.
        text_node_name = f"text_{uuid.uuid4().hex}"
        graph.node(text_node_name, label=text_label, shape="plaintext")
        if parent:
            graph.edge(parent, text_node_name)
95+
# Fetch the page and take the raw HTML of the first loaded document.
loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content

# Parse the markup and turn the <html> subtree into the nested dict structure.
soup = BeautifulSoup(html_content, features='html.parser')
html_structure = tag_structure(soup.find('html'))

# Lay the graph out left to right; use rankdir='TB' for top to bottom.
dot = Digraph()
dot.attr(rankdir='LR')

# Draw only the tags that lead to text, leaving <script> subtrees out.
# Calling add_nodes_edges(dot, html_structure, include_scripts=False) here
# instead would draw every tag.
add_text_nodes_only(dot, html_structure, include_scripts=False)

# Render the graph to a PNG named after 'html_structure' and open it.
dot.render(filename='html_structure', view=True, format='png')
0 commit comments