33from langchain_community .document_loaders import AsyncHtmlLoader
44import time
55
def hash_subtree_structure(node):
    """Return a hash describing the shape of the subtree rooted at ``node``.

    A leaf hashes the one-element tuple holding its ``value``. Any other
    node hashes its ``value`` paired with the ordered tuple of its
    children's structure hashes, so child order matters and node
    attributes do not contribute.
    """
    if not node.is_leaf:
        # Hash the children first, in order, then fold them into this node.
        signature = tuple(map(hash_subtree_structure, node.children))
        return hash((node.value, signature))
    return hash((node.value,))
12+
def hash_subtree_content(node):
    """Return a hash of the subtree's text content.

    The text gathered by ``get_all_text`` is lower-cased and stripped of
    leading/trailing whitespace before hashing, so subtrees whose text
    differs only in case or surrounding whitespace hash the same.
    """
    return hash(get_all_text(node).lower().strip())
17+
def get_all_text(node):
    """Return the concatenated text of ``node`` and all of its descendants.

    Only nodes whose ``value`` is ``'text'`` contribute; each contributes
    ``attributes['content']`` (or an empty string when that key is
    missing). Fragments are gathered in pre-order: a node first, then its
    children from first to last.

    Args:
        node: Object exposing ``value``, ``attributes`` (a mapping) and
            ``children`` (an iterable of nodes of the same kind).

    Returns:
        The joined text, or ``''`` when the subtree has no text nodes.
    """
    fragments = []

    def _collect(current):
        if current.value == 'text':
            fragments.append(current.attributes.get('content', ''))
        for child in current.children:
            _collect(child)

    _collect(node)
    # Join once at the end rather than growing a string with ``+=`` at
    # every recursion level, which re-copies the accumulated text.
    return ''.join(fragments)
24+
625class TreeNode :
def __init__(self, value=None, attributes=None, children=None, parent=None, depth=0):
    """Initialise a node with its payload and position in the tree.

    ``attributes`` and ``children`` default to a fresh empty dict and a
    fresh empty list respectively when not supplied. The root and fork
    paths are computed immediately; both hash fields start as ``None``
    and are filled in later by ``update_hashes``.
    """
    self.value = value
    self.attributes = {} if attributes is None else attributes
    self.children = [] if children is None else children
    self.parent = parent
    self.depth = depth
    self.leads_to_text = False
    self.root_path = self._compute_root_path()
    self.closest_fork_path = self._compute_fork_path()
    # Hashes are not computed at construction time.
    self.structure_hash = None
    self.content_hash = None
1637
def add_child(self, child_node):
    """Attach ``child_node`` as the last child of this node.

    The child's ``parent`` and ``depth`` are set relative to this node and
    its paths are recomputed. This node then refreshes its
    ``leads_to_text`` flag and its own hashes; no other node's hashes are
    touched here.
    """
    child_node.parent, child_node.depth = self, self.depth + 1
    self.children.append(child_node)
    child_node.update_paths()
    self.update_leads_to_text()
    # The subtree under this node just changed shape, so rehash it.
    self.update_hashes()
45+
def update_hashes(self):
    """Recompute and store this node's structure and content hashes.

    Both values are derived from the subtree currently rooted at this
    node, via ``hash_subtree_structure`` and ``hash_subtree_content``.
    """
    structure_digest = hash_subtree_structure(self)
    self.structure_hash = structure_digest
    content_digest = hash_subtree_content(self)
    self.content_hash = content_digest
2349
2450 def update_paths (self ):
2551 self .root_path = self ._compute_root_path ()
@@ -59,7 +85,7 @@ def get_subtrees(self):
5985 for child in self .children :
6086 subtrees .extend (child .get_subtrees ())
6187 return subtrees
62-
88+
def __repr__(self):
    """Return a one-line debug description of the node.

    Shows the value, the ``leads_to_text`` flag, the depth, and both
    cached paths.
    """
    template = (
        "TreeNode(value={value} , leads_to_text={leads_to_text} , "
        "depth={depth} , root_path={root_path} , "
        "closest_fork_path={closest_fork_path} )"
    )
    return template.format(
        value=self.value,
        leads_to_text=self.leads_to_text,
        depth=self.depth,
        root_path=self.root_path,
        closest_fork_path=self.closest_fork_path,
    )
6591
@@ -110,9 +136,49 @@ def build_dom_tree(self, soup_node, tree_node):
110136 tree_node .add_child (new_node )
111137 self .build_dom_tree (child , new_node )
112138
def index_subtrees(subtrees):
    """Group subtrees by the structure hash and content hash of their roots.

    Each root's hashes are refreshed before they are read. ``add_child``
    rehashes only the node that received the child, at the moment the
    child is attached; descendants added to that child afterwards are not
    reflected in the cached value, and a node that never received a child
    still holds ``None``. Refreshing here makes the index reflect the
    tree as it is now.

    Args:
        subtrees: Iterable of objects whose ``root`` exposes
            ``update_hashes()``, ``structure_hash`` and ``content_hash``.

    Returns:
        A ``(structure_index, content_index)`` pair of ``defaultdict(list)``
        objects mapping a hash to the subtrees sharing it, in input order.
    """
    from collections import defaultdict

    structure_index = defaultdict(list)
    content_index = defaultdict(list)

    for subtree in subtrees:
        root = subtree.root
        # Recompute from the current tree instead of trusting the cache.
        root.update_hashes()
        structure_index[root.structure_hash].append(subtree)
        content_index[root.content_hash].append(subtree)

    return structure_index, content_index
152+
def find_matching_subtrees(index):
    """Return every unordered pair of subtrees that share an index bucket.

    Args:
        index: Mapping from a hash to the list of subtrees with that hash,
            as produced by ``index_subtrees``.

    Returns:
        A list of ``(earlier, later)`` tuples. Buckets are visited in the
        mapping's iteration order, and within a bucket pairs keep list
        order. Buckets holding fewer than two subtrees yield nothing.
    """
    from itertools import combinations

    matches = []
    for bucket in index.values():
        # combinations() yields nothing for buckets shorter than two.
        matches.extend(combinations(bucket, 2))
    return matches
162+
def print_subtree_details(subtree):
    """Return a single-line summary of every node visited in ``subtree``.

    Despite the name, nothing is printed: the summary string is returned.
    Each node visited by ``subtree.traverse`` contributes its value and
    its ``'content'`` attribute (empty when absent); entries are joined
    with ``" | "``.
    """
    entries = []

    def _record(node):
        entries.append(f"{node.value} : {node.attributes.get('content', '')} ")

    subtree.traverse(_record)
    return " | ".join(entries)
168+
def print_matches_side_by_side(matches):
    """Print the node summaries of each matched pair, one pair per section.

    For every ``(subtree1, subtree2)`` pair a header is printed, followed
    by the summary of each subtree as produced by
    ``print_subtree_details``, then a dashed divider line.
    """
    divider = "\n " + "-" * 100 + "\n "
    for first, second in matches:
        first_details = print_subtree_details(first)
        second_details = print_subtree_details(second)
        print("Match Pair:")
        print("Subtree 1:", first_details)
        print("Subtree 2:", second_details)
        print(divider)
178+
# Usage example:

# Fetch the page and keep the raw HTML of the first loaded document.
# The URL must not contain whitespace, otherwise the host is invalid.
loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content
118184
@@ -121,11 +187,26 @@ def build_dom_tree(self, soup_node, tree_node):
dom_tree = DOMTree(html_content)
subtrees = dom_tree.get_subtrees()  # Retrieve subtrees rooted at fork nodes

# Report the build time before any indexing, matching or printing, so the
# figure is not inflated by the comparison work below.
# NOTE(review): curr_time is presumably set just before the tree is built,
# in lines not shown here - confirm.
print(f"Time taken to build DOM tree: {time.time() - curr_time:.2f} seconds")

# Index subtrees by structure and content
structure_index, content_index = index_subtrees(subtrees)

# Find matches based on structure
structure_matches = find_matching_subtrees(structure_index)
print("Structure-based matches found:", len(structure_matches))

# Print structure-based matches side by side
print_matches_side_by_side(structure_matches)

# Optionally, do the same for content-based matches if needed
content_matches = find_matching_subtrees(content_index)
print("Content-based matches found:", len(content_matches))
print_matches_side_by_side(content_matches)

# Optionally, traverse each subtree
# for subtree in subtrees:
#     print("Subtree rooted at:", subtree.root.value)
#     subtree.traverse(lambda node: print(node))
# Traverse the DOMTree and print each node
# dom_tree.traverse(lambda node: print(node))
0 commit comments