Skip to content

Commit dd99ac5

Browse files
committed
structural and textual hashing
1 parent c5f9fca commit dd99ac5

File tree

1 file changed

+87
-6
lines changed

1 file changed

+87
-6
lines changed

scrapegraphai/utils/aaa.py

Lines changed: 87 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,49 @@
33
from langchain_community.document_loaders import AsyncHtmlLoader
44
import time
55

6+
def hash_subtree_structure(node):
    """Return a hash describing the shape of the subtree rooted at ``node``.

    A leaf (``node.is_leaf`` is true) hashes to ``hash((node.value,))``.
    Any other node hashes to ``hash((node.value, child_hashes))`` where
    ``child_hashes`` is the tuple of its children's hashes in child order,
    so two subtrees get the same hash when their values and child ordering
    match all the way down.

    The tree is walked iteratively (post-order) instead of recursively so
    that very deep trees cannot hit the interpreter's recursion limit.

    Note: the built-in ``hash`` is used, so when node values are strings the
    result is only comparable within a single interpreter process unless
    ``PYTHONHASHSEED`` is fixed.

    Args:
        node: Object exposing ``value``, ``children`` and ``is_leaf``.

    Returns:
        int: The structural hash of the subtree.
    """
    finished = []  # hashes of completed subtrees, in post-order
    pending = [(node, False)]  # (node, children_already_scheduled)

    while pending:
        current, children_scheduled = pending.pop()

        if current.is_leaf:
            finished.append(hash((current.value,)))
        elif not children_scheduled:
            # Revisit this node once every child has produced its hash.
            pending.append((current, True))
            # Reversed so the first child is popped, and finished, first.
            pending.extend(
                (child, False) for child in reversed(current.children)
            )
        else:
            # The last len(children) entries are exactly this node's
            # children, in order. Slice from an explicit start index so a
            # child count of zero yields an empty tuple.
            start = len(finished) - len(current.children)
            child_hashes = tuple(finished[start:])
            del finished[start:]
            finished.append(hash((current.value, child_hashes)))

    return finished[0]
12+
13+
def hash_subtree_content(node):
    """Return a hash of the text contained in the subtree rooted at ``node``.

    The text is gathered with ``get_all_text``, lower-cased, and stripped of
    leading and trailing whitespace before hashing, so subtrees whose text
    differs only by letter case or surrounding whitespace hash identically.

    Args:
        node: Root of the subtree whose text should be hashed.

    Returns:
        int: ``hash`` of the normalised text.
    """
    raw_text = get_all_text(node)
    normalized = raw_text.lower().strip()
    return hash(normalized)
17+
18+
def get_all_text(node):
    """Return the concatenated text of ``node`` and all of its descendants.

    Only nodes whose ``value`` equals ``'text'`` contribute: each adds
    ``attributes.get('content', '')``. Pieces are emitted in depth-first,
    pre-order sequence (a node before its children, children left to right).

    The walk is iterative and the pieces are joined once at the end, so deep
    trees cannot exhaust the recursion limit and the string is not rebuilt
    for every node.

    Args:
        node: Object exposing ``value``, ``attributes`` and ``children``.

    Returns:
        str: All text content found in the subtree, or ``''`` if none.
    """
    pieces = []
    pending = [node]

    while pending:
        current = pending.pop()
        if current.value == 'text':
            pieces.append(current.attributes.get('content', ''))
        # Reversed so the first child is popped, and emitted, first.
        pending.extend(reversed(current.children))

    return ''.join(pieces)
24+
625
class TreeNode:
726
def __init__(self, value=None, attributes=None, children=None, parent=None, depth=0):
827
self.value = value
928
self.attributes = attributes if attributes is not None else {}
1029
self.children = children if children is not None else []
1130
self.parent = parent
1231
self.depth = depth
13-
self.leads_to_text = False # Initialize the flag as False
32+
self.leads_to_text = False
1433
self.root_path = self._compute_root_path()
1534
self.closest_fork_path = self._compute_fork_path()
35+
self.structure_hash = None
36+
self.content_hash = None
1637

1738
def add_child(self, child_node):
1839
child_node.parent = self
1940
child_node.depth = self.depth + 1
2041
self.children.append(child_node)
2142
child_node.update_paths()
22-
self.update_leads_to_text() # Update this node if the child leads to text
43+
self.update_leads_to_text()
44+
self.update_hashes() # Update hashes when the structure changes
45+
46+
def update_hashes(self):
47+
self.structure_hash = hash_subtree_structure(self)
48+
self.content_hash = hash_subtree_content(self)
2349

2450
def update_paths(self):
2551
self.root_path = self._compute_root_path()
@@ -59,7 +85,7 @@ def get_subtrees(self):
5985
for child in self.children:
6086
subtrees.extend(child.get_subtrees())
6187
return subtrees
62-
88+
6389
def __repr__(self):
6490
return f"TreeNode(value={self.value}, leads_to_text={self.leads_to_text}, depth={self.depth}, root_path={self.root_path}, closest_fork_path={self.closest_fork_path})"
6591

@@ -110,9 +136,49 @@ def build_dom_tree(self, soup_node, tree_node):
110136
tree_node.add_child(new_node)
111137
self.build_dom_tree(child, new_node)
112138

139+
def index_subtrees(subtrees):
    """Group subtrees by the hashes stored on their root nodes.

    Each subtree is filed under ``subtree.root.structure_hash`` in the first
    index and under ``subtree.root.content_hash`` in the second, preserving
    the order in which the subtrees were supplied.

    Args:
        subtrees: Iterable of objects exposing a ``root`` with
            ``structure_hash`` and ``content_hash`` attributes.

    Returns:
        tuple: ``(structure_index, content_index)``, two
        ``defaultdict(list)`` instances mapping hash -> list of subtrees.
    """
    from collections import defaultdict

    by_structure, by_content = defaultdict(list), defaultdict(list)

    for candidate in subtrees:
        structure_key = candidate.root.structure_hash
        content_key = candidate.root.content_hash
        by_structure[structure_key].append(candidate)
        by_content[content_key].append(candidate)

    return by_structure, by_content
152+
153+
def find_matching_subtrees(index):
    """Return every pair of subtrees that share a key in ``index``.

    For each group of subtrees stored under the same hash, all unordered
    pairs ``(earlier, later)`` are produced, following the order of the
    group. Groups with fewer than two entries contribute nothing.

    Args:
        index: Mapping of hash -> list of subtrees, as returned by
            ``index_subtrees``.

    Returns:
        list: Tuples ``(subtree_a, subtree_b)`` of matching subtrees.
    """
    from itertools import combinations

    matches = []
    # Only the grouped subtrees matter here; the hash keys are not needed.
    for group in index.values():
        matches.extend(combinations(group, 2))
    return matches
162+
163+
def print_subtree_details(subtree):
    """Build a one-line description of every node visited in ``subtree``.

    Despite its name, this function prints nothing: it returns the text.
    Each node visited by ``subtree.traverse`` is rendered as
    ``"<value>: <content>"`` (``content`` taken from the node's attributes,
    empty when absent) and the entries are joined with ``" | "``.

    Args:
        subtree: Object exposing ``traverse(callback)``, which invokes the
            callback once per node.

    Returns:
        str: The joined node descriptions, or ``''`` if no node is visited.
    """
    descriptions = []

    def describe(node):
        descriptions.append(f"{node.value}: {node.attributes.get('content', '')}")

    subtree.traverse(describe)
    return " | ".join(descriptions)
168+
169+
def print_matches_side_by_side(matches):
    """Print each matched pair of subtrees, one pair per section.

    For every pair, both descriptions are obtained from
    ``print_subtree_details`` before anything is written, then a
    "Match Pair:" header, the two descriptions, and a dashed separator
    line are printed to standard output.

    Args:
        matches: Iterable of two-item pairs of subtrees, such as the list
            returned by ``find_matching_subtrees``.
    """
    for first, second in matches:
        first_details = print_subtree_details(first)
        second_details = print_subtree_details(second)

        print("Match Pair:")
        print("Subtree 1:", first_details)
        print("Subtree 2:", second_details)
        print("\n" + "-"*100 + "\n")
178+
113179
# Usage example:
114180

115-
loader = AsyncHtmlLoader('https://github.com/PeriniM')
181+
loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
116182
document = loader.load()
117183
html_content = document[0].page_content
118184

@@ -121,11 +187,26 @@ def build_dom_tree(self, soup_node, tree_node):
121187
dom_tree = DOMTree(html_content)
122188
subtrees = dom_tree.get_subtrees() # Retrieve subtrees rooted at fork nodes
123189

190+
# Index subtrees by structure and content
191+
structure_index, content_index = index_subtrees(subtrees)
192+
193+
# Find matches based on structure
194+
structure_matches = find_matching_subtrees(structure_index)
195+
print("Structure-based matches found:", len(structure_matches))
196+
197+
# Print structure-based matches side by side
198+
print_matches_side_by_side(structure_matches)
199+
200+
# Optionally, do the same for content-based matches if needed
201+
content_matches = find_matching_subtrees(content_index)
202+
print("Content-based matches found:", len(content_matches))
203+
print_matches_side_by_side(content_matches)
204+
124205
print(f"Time taken to build DOM tree: {time.time() - curr_time:.2f} seconds")
125206

126207
# Optionally, traverse each subtree
127-
for subtree in subtrees:
128-
print("Subtree rooted at:", subtree.root.value)
208+
# for subtree in subtrees:
209+
# print("Subtree rooted at:", subtree.root.value)
129210
# subtree.traverse(lambda node: print(node))
130211
# Traverse the DOMTree and print each node
131212
# dom_tree.traverse(lambda node: print(node))

0 commit comments

Comments
 (0)