|
import time
from collections import defaultdict
from itertools import combinations

from langchain_community.document_loaders import AsyncHtmlLoader

from scrapegraphai.asdt import DOMTree
| 4 | + |
def index_subtrees(subtrees):
    """Group subtrees by the structure hash and content hash of their roots.

    Args:
        subtrees: Iterable of objects exposing ``root.structure_hash`` and
            ``root.content_hash``.

    Returns:
        A ``(structure_index, content_index)`` tuple of ``defaultdict(list)``
        objects. Each maps a hash value to the subtrees whose root carries
        that hash, in input order.
    """
    structure_index = defaultdict(list)
    content_index = defaultdict(list)

    for subtree in subtrees:
        root = subtree.root
        structure_index[root.structure_hash].append(subtree)
        content_index[root.content_hash].append(subtree)

    return structure_index, content_index
| 18 | + |
def find_matching_subtrees(index):
    """Return every unordered pair of subtrees that share a hash bucket.

    Args:
        index: Mapping from a hash value to the collection of subtrees that
            produced it (for example one of the indexes returned by
            ``index_subtrees``).

    Returns:
        A list of ``(first, second)`` tuples. Buckets are visited in mapping
        order; within a bucket, pairs follow the bucket's own order, with the
        earlier element first. Buckets holding a single subtree yield nothing.
    """
    matches = []
    # Only the grouped subtrees matter here; the hash keys are not needed.
    for bucket in index.values():
        # combinations() yields nothing for buckets with fewer than two items.
        matches.extend(combinations(bucket, 2))
    return matches
| 28 | + |
def print_subtree_details(subtree):
    """Build a one-line summary of a subtree for side-by-side comparison.

    Despite its name, this helper prints nothing; it returns the summary.

    Args:
        subtree: Object whose ``traverse(callback)`` method invokes the
            callback on each node. Every node must expose ``value`` and an
            ``attributes`` mapping.

    Returns:
        A string of ``"<value>: <content>"`` entries joined by ``" | "``, in
        the order ``traverse`` visits the nodes. ``<content>`` is the node's
        ``'content'`` attribute, or an empty string when it is absent.
    """
    details = []

    def _collect(node):
        details.append(f"{node.value}: {node.attributes.get('content', '')}")

    subtree.traverse(_collect)
    return " | ".join(details)
| 34 | + |
def print_matches_side_by_side(matches):
    """Print each matched pair of subtrees, followed by a separator line.

    Args:
        matches: Iterable of ``(subtree1, subtree2)`` pairs, such as the list
            returned by ``find_matching_subtrees``.
    """
    for subtree1, subtree2 in matches:
        # Build both summaries before printing anything for this pair.
        subtree1_details = print_subtree_details(subtree1)
        subtree2_details = print_subtree_details(subtree2)
        print("Match Pair:")
        print("Subtree 1:", subtree1_details)
        print("Subtree 2:", subtree2_details)
        print("\n" + "-" * 100 + "\n")
| 44 | + |
| 45 | +# ********************************************************************************************************************* |
| 46 | +# Usage example: |
| 47 | +# ********************************************************************************************************************* |
| 48 | + |
# Fetch the example page; the loader result is indexed as a sequence and the
# first entry's page_content is used as the HTML to parse.
loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
document = loader.load()
html_content = document[0].page_content

# perf_counter() is the clock intended for measuring elapsed intervals.
curr_time = time.perf_counter()
# Instantiate a DOMTree with HTML content
dom_tree = DOMTree(html_content)
# Capture the elapsed time immediately, so the figure reported at the end
# covers only tree construction and not the filtering and printing below.
build_seconds = time.perf_counter() - curr_time
# nodes, metadatas = dom_tree.collect_text_nodes()  # Collect text nodes for analysis
# for node, metadata in zip(nodes, metadatas):
#     print("Text:", node)
#     print("Metadata:", metadata)

# sub_list = dom_tree.generate_subtree_dicts()  # Generate subtree dictionaries for analysis
# print(sub_list)
# graph = dom_tree.visualize(exclude_tags=['script', 'style', 'meta', 'link'])
subtrees = dom_tree.get_subtrees()  # Retrieve subtrees rooted at fork nodes
print("Number of subtrees found:", len(subtrees))

# Remove trees whose root node does not lead to any text
text_subtrees = [subtree for subtree in subtrees if subtree.root.leads_to_text]
print("Number of subtrees that lead to text:", len(text_subtrees))

direct_leaf_subtrees = [subtree for subtree in text_subtrees if subtree.root.has_direct_leaves]
print("Number of subtrees with direct leaves beneath fork nodes:", len(direct_leaf_subtrees))

for subtree in direct_leaf_subtrees:
    print("Subtree rooted at:", subtree.root.value)
    subtree.traverse(print)

# Index subtrees by structure and content
# structure_index, content_index = index_subtrees(subtrees)

# # Find matches based on structure
# structure_matches = find_matching_subtrees(structure_index)
# print("Structure-based matches found:", len(structure_matches))

# # Print structure-based matches side by side
# print_matches_side_by_side(structure_matches)

# # Optionally, do the same for content-based matches if needed
# content_matches = find_matching_subtrees(content_index)
# print("Content-based matches found:", len(content_matches))
# print_matches_side_by_side(content_matches)

print(f"Time taken to build DOM tree: {build_seconds:.2f} seconds")

# Optionally, traverse each subtree
# for subtree in subtrees:
#     print("Subtree rooted at:", subtree.root.value)
#     subtree.traverse(lambda node: print(node))
# Traverse the DOMTree and print each node
# dom_tree.traverse(lambda node: print(node))
0 commit comments