structural and textual hashing

PeriniM · PeriniM · commit dd99ac595efb · 2024-04-25T11:56:41.000+02:00
diff --git a/scrapegraphai/utils/aaa.py b/scrapegraphai/utils/aaa.py
@@ -3,23 +3,49 @@
 from langchain_community.document_loaders import AsyncHtmlLoader
 import time
 
+def hash_subtree_structure(node):
+    """ Hash a subtree's shape: the node's value combined with its children's structure hashes, in order. """
+    if not node.is_leaf:
+        nested = tuple(map(hash_subtree_structure, node.children))
+        return hash((node.value, nested))
+    return hash((node.value,))  # leaves hash their value alone; children are never visited
+
+def hash_subtree_content(node):
+    """ Hash the subtree's combined text after lower-casing it and trimming surrounding whitespace. """
+    raw_text = get_all_text(node)
+    return hash(raw_text.lower().strip())
+
+def get_all_text(node):
+    """ Return the 'content' of every 'text' node in the subtree, concatenated in pre-order (node, then children). """
+    own_text = node.attributes.get('content', '') if node.value == 'text' else ''
+    # A single join avoids re-copying the accumulated string once per child.
+    descendant_text = ''.join(get_all_text(child) for child in node.children)
+    return own_text + descendant_text
+
 class TreeNode:
     def __init__(self, value=None, attributes=None, children=None, parent=None, depth=0):
         self.value = value
         self.attributes = attributes if attributes is not None else {}
         self.children = children if children is not None else []
         self.parent = parent
         self.depth = depth
-        self.leads_to_text = False  # Initialize the flag as False
+        self.leads_to_text = False
         self.root_path = self._compute_root_path()
         self.closest_fork_path = self._compute_fork_path()
+        self.structure_hash = None  # filled in by update_hashes(); None until then
+        self.content_hash = None  # filled in by update_hashes(); None until then
 
     def add_child(self, child_node):
         child_node.parent = self
         child_node.depth = self.depth + 1
         self.children.append(child_node)
         child_node.update_paths()
-        self.update_leads_to_text()  # Update this node if the child leads to text
+        self.update_leads_to_text()
+        self.update_hashes()  # Re-hash this node only; NOTE(review): ancestors keep their old hashes, and children attached to child_node later are not picked up here
+
+    def update_hashes(self):
+        """ Recompute and cache this node's structure and content hashes from its current subtree. """
+        self.structure_hash, self.content_hash = hash_subtree_structure(self), hash_subtree_content(self)
 
     def update_paths(self):
         self.root_path = self._compute_root_path()
@@ -59,7 +85,7 @@ def get_subtrees(self):
         for child in self.children:
             subtrees.extend(child.get_subtrees())
         return subtrees
-    
+
     def __repr__(self):
         return f"TreeNode(value={self.value}, leads_to_text={self.leads_to_text}, depth={self.depth}, root_path={self.root_path}, closest_fork_path={self.closest_fork_path})"
 
@@ -110,9 +136,49 @@ def build_dom_tree(self, soup_node, tree_node):
                 tree_node.add_child(new_node)
                 self.build_dom_tree(child, new_node)
 
+def index_subtrees(subtrees):
+    """ Group subtrees by root structure hash and root content hash; returns (structure_index, content_index). """
+    from collections import defaultdict
+    structure_index = defaultdict(list)
+    content_index = defaultdict(list)
+    for subtree in subtrees:
+        root = subtree.root
+        # add_child() hashes a parent before the new child's own descendants exist, so refresh the cached values first.
+        root.update_hashes()
+        structure_index[root.structure_hash].append(subtree)
+        content_index[root.content_hash].append(subtree)
+
+    return structure_index, content_index
+
+def find_matching_subtrees(index):
+    """ Return every unordered pair of subtrees that share a bucket in `index` (a hash -> list mapping). """
+    from itertools import combinations
+    matches = []
+    for bucket in index.values():
+        # combinations() yields (bucket[i], bucket[j]) for i < j in list order,
+        # and nothing at all for buckets holding fewer than two subtrees.
+        matches.extend(combinations(bucket, 2))
+    return matches
+
+def print_subtree_details(subtree):
+    """ Build (without printing) a ' | '-joined 'value: content' summary of the nodes visited by subtree.traverse(). """
+    visited = []
+    subtree.traverse(visited.append)
+    return " | ".join(f"{node.value}: {node.attributes.get('content', '')}" for node in visited)
+
+def print_matches_side_by_side(matches):
+    """ Print both subtree summaries of each matched pair, followed by a 100-dash separator. """
+    for first, second in matches:
+        first_details = print_subtree_details(first)
+        second_details = print_subtree_details(second)
+        print("Match Pair:")
+        print("Subtree 1:", first_details)
+        print("Subtree 2:", second_details)
+        print("\n" + "-"*100 + "\n")
+
 # Usage example:
 
-loader = AsyncHtmlLoader('https://github.com/PeriniM')
+loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
 document = loader.load()
 html_content = document[0].page_content
 
@@ -121,11 +187,26 @@ def build_dom_tree(self, soup_node, tree_node):
 dom_tree = DOMTree(html_content)
 subtrees = dom_tree.get_subtrees()  # Retrieve subtrees rooted at fork nodes
 
+# Bucket the fork-rooted subtrees by structure hash and by content hash
+structure_index, content_index = index_subtrees(subtrees)
+
+# Pair up subtrees whose roots share a structure hash
+structure_matches = find_matching_subtrees(structure_index)
+print("Structure-based matches found:", len(structure_matches))
+
+# Print each structure-based pair, one subtree summary per line
+print_matches_side_by_side(structure_matches)
+
+# Repeat the pairing and report for content hashes (this always runs)
+content_matches = find_matching_subtrees(content_index)
+print("Content-based matches found:", len(content_matches))
+print_matches_side_by_side(content_matches)  # NOTE(review): the "Time taken to build DOM tree" figure printed next also covers this matching and printing
+
 print(f"Time taken to build DOM tree: {time.time() - curr_time:.2f} seconds")
 
 # Optionally, traverse each subtree
-for subtree in subtrees:
-    print("Subtree rooted at:", subtree.root.value)
+# for subtree in subtrees:
+#     print("Subtree rooted at:", subtree.root.value)
     # subtree.traverse(lambda node: print(node))
 # Traverse the DOMTree and print each node
 # dom_tree.traverse(lambda node: print(node))