11from bs4 import BeautifulSoup , NavigableString
22from graphviz import Digraph
33from langchain_community .document_loaders import AsyncHtmlLoader
4+ import json
5+ from bs4 import BeautifulSoup , NavigableString , Comment
6+ import time
47
def tag_structure(tag, exclude=None):
    """
    Recursively get a tag's structure, including its attributes, children, and textual content,
    with an option to exclude specific tags. Text is treated as separate nodes.

    :param tag: BeautifulSoup tag object (during recursion also a text or comment node)
    :param exclude: Tag name, or iterable of tag names, to exclude from the structure
    :return: ``{tag_name: {'attrs': {...}, 'children': [...]}}`` for a tag,
             ``{'text': {'content': str, 'children': []}}`` for non-blank text,
             or None for comments, whitespace-only text and excluded tags
    """
    if exclude is None:
        exclude = ()
    elif isinstance(exclude, str):
        # A bare string would turn the membership test below into a substring
        # match (e.g. 'p' in 'script'), so wrap it as a single tag name.
        exclude = (exclude,)

    # Comments are tested before the generic text check so they are dropped
    # rather than emitted as text nodes.
    if isinstance(tag, Comment):
        return None

    if isinstance(tag, NavigableString):
        text_content = tag.strip()
        if not text_content:
            return None  # Whitespace-only text carries no information
        # Text nodes share the one-key-dict shape of tag nodes so consumers
        # can walk the tree uniformly.
        return {
            'text': {
                'content': text_content,
                'children': []
            }
        }

    if tag.name in exclude:
        return None  # Skip tags specified in the exclude list

    tag_info = {
        'attrs': dict(tag.attrs),
        'children': []
    }

    for child in tag.children:
        child_structure = tag_structure(child, exclude=exclude)
        if child_structure is not None:
            # Append structure or text node to children
            tag_info['children'].append(child_structure)

    return {tag.name: tag_info}
3050
51+
3152# Function to recursively traverse the structured HTML dictionary and create graph nodes and edges
3253def add_nodes_edges (graph , structure , parent = None , include_scripts = True ):
3354 if isinstance (structure , dict ):
@@ -56,59 +77,69 @@ def add_nodes_edges(graph, structure, parent=None, include_scripts=True):
5677
def has_text_content(structure):
    """
    Tell whether a structured HTML node contains any non-whitespace text.

    Understands both node shapes used in this file: a plain string child, and
    a text node of the form ``{'text': {'content': str, 'children': []}}``.
    Only ``children`` lists are descended into, so attribute values (such as a
    ``class`` list stored under ``attrs``) are never mistaken for text.

    :param structure: A string, or a ``{name: {...}}`` node dictionary
    :return: True if the node or any descendant holds non-blank text, else False
    """
    if isinstance(structure, str):
        # If it's a string with non-whitespace characters, it's text content
        return bool(structure.strip())

    if not isinstance(structure, dict):
        return False

    for name, content in structure.items():
        if not isinstance(content, dict):
            continue

        # A text node is identified by its 'content' string; an HTML <text>
        # element has 'attrs' instead and falls through to the children scan.
        text = content.get('content')
        if name == 'text' and isinstance(text, str) and text.strip():
            return True

        if any(has_text_content(child) for child in content.get('children', [])):
            return True

    return False
6594
def add_text_nodes_only(graph, structure, parent=None):
    """
    Recursively traverse the structured HTML dictionary and create graph nodes and edges,
    drawing text nodes as plaintext leaves, using Graphviz Digraph object.

    Text nodes have the shape ``{'text': {'content': str, 'children': []}}``;
    every other one-key dictionary is treated as a tag whose ``children`` are
    visited recursively.

    :param graph: Graphviz Digraph object (anything exposing ``node`` and ``edge``)
    :param structure: Structured HTML dictionary
    :param parent: ID of the parent node
    """
    if not isinstance(structure, dict):
        return

    for tag, content in structure.items():
        if not isinstance(content, dict):
            continue

        text = content.get('content')
        if tag == 'text' and isinstance(text, str):
            # Content is a text node: label it with (at most 30 chars of) its text
            text_label = (text[:30] + '...') if len(text) > 30 else text
            # id() keeps node names unique even when the same text repeats
            text_node_name = f"text_{id(content)}"
            graph.node(text_node_name, label=text_label, shape="plaintext")
            if parent:
                graph.edge(parent, text_node_name)
        else:
            # Content is a tag with children
            node_name = f"{tag}_{id(content)}"
            graph.node(node_name, label=tag)
            if parent:
                graph.edge(parent, node_name)
            for child in content.get('children', []):
                add_text_nodes_only(graph, child, parent=node_name)
122+
95123
# Fetch the page to visualize
loader = AsyncHtmlLoader('https://www.mymovies.it/cinema/roma/')
document = loader.load()
html_content = document[0].page_content

# perf_counter is monotonic, so the elapsed time cannot be skewed by
# system clock adjustments the way time.time() differences can.
curr_time = time.perf_counter()
# Parse HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Generate structured HTML, leaving out non-visible parts of the page
html_structure = tag_structure(soup.find('html'), exclude=['head', 'style', 'script'])
print(f"Time taken to generate structured HTML: {time.perf_counter() - curr_time:.2f} seconds")
# print(json.dumps(html_structure, indent=2))

# Create a Digraph object
dot = Digraph()
dot.attr(rankdir='LR')  # Left to Right, change to 'TB' for Top to Bottom

# Recursively add nodes and edges based on the structured HTML dictionary
# add_nodes_edges(dot, html_structure, include_scripts=False)
add_text_nodes_only(dot, html_structure)
# Render the graph to a file and view it
dot.render('html_structure', view=True, format='png')
0 commit comments