dom tree structure

PeriniM · PeriniM · commit ae69e4b3406e · 2024-04-24T18:59:44.000+02:00
diff --git a/scrapegraphai/utils/aaa.py b/scrapegraphai/utils/aaa.py
@@ -0,0 +1,170 @@
+from bs4 import BeautifulSoup
+from bs4.element import Tag, NavigableString, Comment, Doctype, Declaration, ProcessingInstruction
+from langchain_community.document_loaders import AsyncHtmlLoader
+import time
+
+class TreeNode:
+    """
+    Node of a DOM-like tree.
+
+    ``root_path`` and ``closest_fork_path`` are derived from the current shape of
+    the tree each time they are read, so they stay correct when a sibling is added
+    later or when a node that already has descendants is attached.
+
+    :param value: Label of the node (a tag name, or 'text' for a text node)
+    :param attributes: Dict of node attributes; a new empty dict when omitted
+    :param children: List of child nodes; a new empty list when omitted
+    :param parent: Parent node, or None for a root
+    """
+
+    def __init__(self, value=None, attributes=None, children=None, parent=None):
+        self.value = value
+        self.attributes = attributes if attributes is not None else {}
+        self.children = children if children is not None else []
+        self.parent = parent
+        self.leads_to_text = False  # Maintained by update_leads_to_text()
+
+    @property
+    def root_path(self):
+        """'>'-joined node values from the root (always written 'root') to this node."""
+        return self._compute_root_path()
+
+    @property
+    def closest_fork_path(self):
+        """'>'-joined values from the nearest node with siblings (or the root) to this node."""
+        return self._compute_fork_path()
+
+    def add_child(self, child_node):
+        """
+        Append a node to this node's children and make this node its parent.
+
+        :param child_node: TreeNode to attach
+        """
+        child_node.parent = self
+        self.children.append(child_node)
+        self.update_leads_to_text()  # Update this node if the child leads to text
+
+    def update_paths(self):
+        """Do nothing: paths are computed on access. Kept for backward compatibility."""
+
+    def update_leads_to_text(self):
+        """Flag this node if a child is text or leads to text, then re-check the parent."""
+        # Check if any child leads to text or is a text node
+        if any(child.value == 'text' or child.leads_to_text for child in self.children):
+            self.leads_to_text = True
+        # Update the flag up the tree; an already flagged parent needs no further work
+        if self.parent and not self.parent.leads_to_text:
+            self.parent.update_leads_to_text()
+
+    def _compute_root_path(self):
+        path = []
+        current = self
+        while current.parent:
+            path.append(current.value)
+            current = current.parent
+        path.append('root')  # The topmost node is written 'root' whatever its value
+        return '>'.join(reversed(path))
+
+    def _compute_fork_path(self):
+        path = []
+        current = self
+        # Climb while the current node is an only child
+        while current.parent and len(current.parent.children) == 1:
+            path.append(current.value)
+            current = current.parent
+        path.append(current.value)  # First node with siblings, or the root
+        return '>'.join(reversed(path))
+
+    def __repr__(self):
+        return f"TreeNode(value={self.value}, leads_to_text={self.leads_to_text}, root_path={self.root_path}, closest_fork_path={self.closest_fork_path})"
+
+    @property
+    def is_fork(self):
+        """True when the node has more than one child."""
+        return len(self.children) > 1
+
+    @property
+    def is_leaf(self):
+        """True when the node has no children."""
+        return len(self.children) == 0
+
+class Tree:
+    """
+    Generic tree that holds a root node.
+
+    :param root: Root node, or None for an empty tree
+    """
+
+    def __init__(self, root=None):
+        self.root = root
+
+    def traverse(self, visit_func):
+        """
+        Visit every node depth-first, each parent before its children.
+
+        :param visit_func: Callable invoked with each visited node
+        """
+        def _traverse(node):
+            if node:
+                visit_func(node)
+                for child in node.children:
+                    _traverse(child)
+        _traverse(self.root)
+
+    def __repr__(self):
+        return f"Tree(root={self.root})"
+
+
+class DOMTree(Tree):
+    """
+    Tree built from an HTML string parsed with BeautifulSoup's 'html.parser'.
+
+    :param html_content: HTML source to parse
+    """
+
+    # Markup that bs4 exposes as NavigableString subclasses but that is not page text
+    _SKIPPED_STRINGS = (Comment, Doctype, Declaration, ProcessingInstruction)
+
+    def __init__(self, html_content):
+        super().__init__()
+        self.root = TreeNode('document')
+        self.build_dom_tree(BeautifulSoup(html_content, 'html.parser'), self.root)
+
+    def build_dom_tree(self, soup_node, tree_node):
+        """
+        Recursively mirror the children of a BeautifulSoup node under a TreeNode.
+
+        :param soup_node: BeautifulSoup object or Tag whose children are read
+        :param tree_node: TreeNode that receives the converted children
+        """
+        for child in soup_node.children:
+            if isinstance(child, self._SKIPPED_STRINGS):
+                # Skip comments, and the doctype that would otherwise become a
+                # text node (e.g. 'html' for <!DOCTYPE html>)
+                continue
+            elif isinstance(child, NavigableString):
+                text = child.strip()
+                if text:
+                    # Create a text node with value 'text' and the actual content under 'content' key
+                    tree_node.add_child(TreeNode(value='text', attributes={'content': text}))
+            elif isinstance(child, Tag):
+                # Copy the attributes so the tree does not share the soup's dict
+                new_node = TreeNode(value=child.name, attributes=dict(child.attrs))
+                tree_node.add_child(new_node)
+                self.build_dom_tree(child, new_node)
+
+# Usage example:
+# NOTE(review): this runs at import time and loads a URL; consider an
+# `if __name__ == "__main__":` guard.
+
+loader = AsyncHtmlLoader('https://www.mymovies.it/cinema/roma/')
+document = loader.load()
+html_content = document[0].page_content
+
+curr_time = time.time()
+# Instantiate a DOMTree with HTML content
+dom_tree = DOMTree(html_content)
+print(f"Time taken to build DOM tree: {time.time() - curr_time:.2f} seconds")
+
+# Traverse the DOMTree and print each node
+# dom_tree.traverse(lambda node: print(node))
diff --git a/scrapegraphai/utils/asdt.py b/scrapegraphai/utils/asdt.py
@@ -1,33 +1,54 @@
 from bs4 import BeautifulSoup, NavigableString
 from graphviz import Digraph
 from langchain_community.document_loaders import AsyncHtmlLoader
+import json
+from bs4 import BeautifulSoup, NavigableString, Comment
+import time
 
-def tag_structure(tag, include_scripts=True):
+def tag_structure(tag, exclude=None):
     """
-    Recursively get a tag's structure, including its attributes, children, and textual content.
+    Recursively get a tag's structure, including its attributes, children, and textual content,
+    with an option to exclude specific tags. Text is treated as separate nodes.
+
     :param tag: BeautifulSoup tag object
-    :param include_scripts: Include or exclude <script> tags from the structure
-    :return: A dict with the tag's name, attributes, children, and text
+    :param exclude: List of tag names to skip together with their subtrees (default: none)
+    :return: {name: {'attrs', 'children'}} for a tag, {'text': {'content', 'children'}} for text, else None
     """
-    if isinstance(tag, NavigableString):
-        return tag.strip() if tag.strip() else None
+    if exclude is None:
+        exclude = []
+
+    if isinstance(tag, Comment):
+        return None  # Ignore comments; tested first because bs4's Comment is a NavigableString subclass
 
-    # Skip script tags if include_scripts is False
-    if not include_scripts and tag.name == 'script':
-        return None
+    if isinstance(tag, NavigableString):
+        text_content = tag.strip()
+        if text_content:
+            text_node = {'text': {
+                'content': text_content,
+                'children': []
+                }
+            }
+            return text_node
+        else:
+            return None
+
+    if tag.name in exclude:
+        return None  # Skip tags specified in the exclude list
 
     tag_info = {
         'attrs': dict(tag.attrs),
         'children': []
     }
 
     for child in tag.children:
-        child_structure = tag_structure(child, include_scripts=include_scripts)
+        child_structure = tag_structure(child, exclude=exclude)
         if child_structure:
+            # Append structure or text node to children
             tag_info['children'].append(child_structure)
 
     return {tag.name: tag_info}
 
+
 # Function to recursively traverse the structured HTML dictionary and create graph nodes and edges
 def add_nodes_edges(graph, structure, parent=None, include_scripts=True):
     if isinstance(structure, dict):
@@ -56,59 +77,69 @@ def add_nodes_edges(graph, structure, parent=None, include_scripts=True):
 
 def has_text_content(structure):
+    """Return True if ``structure`` (a str or a tag_structure dict) holds non-blank text."""
     if isinstance(structure, str) and structure.strip():
         return True
     elif isinstance(structure, dict):
-        for content in structure.values():
-            if any(has_text_content(child) for child in content['children']):
-                return True
+        for tag, content in structure.items():
+            if not isinstance(content, dict):
+                continue  # not a tag_structure entry; nothing to inspect
+            # A text node is {'text': {'content': <str>, 'children': []}}
+            text = content.get('content') if tag == 'text' else None
+            if isinstance(text, str) and text.strip():
+                return True
+            # Recurse into 'children' only: 'attrs' values (e.g. class lists) are not text
+            if any(has_text_content(child) for child in content.get('children', [])):
+                return True
     return False
 
-def add_text_nodes_only(graph, structure, parent=None, include_scripts=True):
+def add_text_nodes_only(graph, structure, parent=None):
     """
     Recursively traverse the structured HTML dictionary and create graph nodes and edges
-    for text content only.
+    for every tag and text node it contains, using a Graphviz Digraph object.
     :param graph: Graphviz Digraph object
     :param structure: Structured HTML dictionary
     :param parent: ID of the parent node
-    :param include_scripts: Include or exclude <script> tags from the visualization
+    :return: None; nodes and edges are added to ``graph`` in place
     """
     if isinstance(structure, dict):
         for tag, content in structure.items():
-            # Skip script tags if include_scripts is False
-            if not include_scripts and tag == 'script':
-                continue
-
-            has_text = any(has_text_content(child) for child in content['children'])
-            if has_text:
+            # Text nodes are {'text': {'content': <str>, 'children': []}}; the text is under 'content'
+            text = content.get('content') if tag == 'text' else None
+            if isinstance(text, str):
+                text_label = (text[:30] + '...') if len(text) > 30 else text
+                text_node_name = f"text_{id(content)}"
+                graph.node(text_node_name, label=text_label, shape="plaintext")
+                if parent:
+                    graph.edge(parent, text_node_name)
+            else:
+                # Content is a tag with children
                 node_name = f"{tag}_{id(content)}"
                 graph.node(node_name, label=tag)
                 if parent:
                     graph.edge(parent, node_name)
-                for child in content['children']:
-                    add_text_nodes_only(graph, child, parent=node_name, include_scripts=include_scripts)
-    elif isinstance(structure, str) and structure.strip():
-        text_label = (structure[:30] + '..') if len(structure) > 30 else structure
-        text_node_name = f"text_{id(structure)}"
-        graph.node(text_node_name, label=text_label, shape="plaintext")
-        if parent:
-            graph.edge(parent, text_node_name)
+                for child in content.get('children', []):
+                    add_text_nodes_only(graph, child, parent=node_name)
+
 
+# NOTE(review): everything below runs at import time (loads the URL, renders and opens the graph)
-loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
+loader = AsyncHtmlLoader('https://www.mymovies.it/cinema/roma/')
 document = loader.load()
 html_content = document[0].page_content
+curr_time = time.time()  # Start of the timed section (parsing + structuring)
 # Parse HTML content
 soup = BeautifulSoup(html_content, 'html.parser')
 
-# Generate and print structured HTML
-html_structure = tag_structure(soup.find('html'))
-# print(structure)
+# Build the structured dict for <html>, skipping <head>, <style> and <script>
+html_structure = tag_structure(soup.find('html'), exclude=['head', 'style', 'script'])
+print(f"Time taken to generate structured HTML: {time.time() - curr_time:.2f} seconds")
+# print(json.dumps(html_structure, indent=2))
 
 # Create a Digraph object
 dot = Digraph()
 dot.attr(rankdir='LR')  # Left to Right, change to 'TB' for Top to Bottom
 
 # Recursively add nodes and edges based on the structured HTML dictionary
 # add_nodes_edges(dot, html_structure, include_scripts=False)
-add_text_nodes_only(dot, html_structure, include_scripts=False)
+add_text_nodes_only(dot, html_structure)
 # Render the graph to a file and view it
 dot.render('html_structure', view=True, format='png')