ScrapeGraphAI
diff --git a/examples/custom_graph_domtree.py b/examples/custom_graph_domtree.py
Lines changed: 171 additions & 0 deletions
diff --git a/examples/domtree_example.py b/examples/domtree_example.py
Lines changed: 99 additions & 0 deletions
diff --git a/examples/faiss_vector.py b/examples/faiss_vector.py
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,171 @@
+"""
+Example of custom graph using existing nodes
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.models import OpenAI
+from scrapegraphai.graphs import BaseGraph
+from scrapegraphai.nodes import FetchNode, GenerateAnswerNode
+# Load variables from a local .env file (if any) before the key is read below.
+load_dotenv()
+
+# ************************************************
+# Define the configuration for the graph
+# ************************************************
+
+# os.getenv returns None when OPENAI_APIKEY is not set; no check is made here.
+openai_key = os.getenv("OPENAI_APIKEY")
+
+# Settings handed to the OpenAI model wrapper created below.
+graph_config = {
+    "llm": {
+        "api_key": openai_key,
+        "model": "gpt-3.5-turbo",
+        "temperature": 0,
+        "streaming": True
+    },
+}
+
+# ************************************************
+# Define the graph nodes
+# ************************************************
+
+# Model instance shared with the answer-generation node via node_config.
+llm_model = OpenAI(graph_config["llm"])
+
+# define the nodes for the graph
+# The input strings are expressions over state keys; presumably `|` means
+# "either key" and `&` means "both" - confirm against the node documentation.
+# run() below supplies the `user_prompt` and `local_dir` keys.
+fetch_node = FetchNode(
+    input="url | local_dir",
+    output=["doc"],
+)
+generate_answer_node = GenerateAnswerNode(
+    input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+    output=["answer"],
+    node_config={"llm": llm_model},
+)
+
+# ************************************************
+# Create the graph by defining the connections
+# ************************************************
+
+# Two-node pipeline: fetch_node -> generate_answer_node, starting at fetch_node.
+# NOTE(review): nodes and edges are passed as set literals, which carry no
+# ordering - confirm BaseGraph does not depend on the order of either.
+graph = BaseGraph(
+    nodes={
+        fetch_node,
+        generate_answer_node,
+    },
+    edges={
+        (fetch_node, generate_answer_node)
+    },
+    entry_point=fetch_node
+)
+
+# ************************************************
+# Sample subtree representations
+# (illustrative data only: none of these names is referenced elsewhere
+# in this example script; the graph is executed in run() further down)
+# ************************************************
+
+# Plain-text form: one `path -> "text"` entry per text node.
+# NOTE(review): the list-item paths include an `a` element that the
+# simplified HTML below does not contain - confirm which one is intended.
+subtree_text = '''
+div>div -> "This is a paragraph" \n
+div>ul>li>a>span -> "This is a list item 1" \n
+div>ul>li>a>span -> "This is a list item 2" \n
+div>ul>li>a>span -> "This is a list item 3"
+'''
+
+# The same content written as simplified HTML.
+subtree_simplified_html = '''
+<div>
+    <div>This is a paragraph</div>
+    <ul>
+        <li>
+            <span>This is a list item 1</span>
+        </li>
+        <li>
+            <span>This is a list item 2</span>
+        </li>
+        <li>
+            <span>This is a list item 3</span>
+        </li>
+    </ul>
+</div>
+'''
+
+# Nested-dict form: each text entry carries its content and a `path_to_fork`
+# string; the list items are grouped in a `texts` list under `ul`.
+subtree_dict_simple = {
+    "div": {
+        "text": {
+            "content": "This is a paragraph",
+            "path_to_fork": "div>div",
+        },
+        "ul": {
+            "path_to_fork": "div>ul",
+            "texts": [
+                {
+                    "content": "This is a list item 1",
+                    "path_to_fork": "ul>li>a>span",
+                },
+                {
+                    "content": "This is a list item 2",
+                    "path_to_fork": "ul>li>a>span",
+                },
+                {
+                    "content": "This is a list item 3",
+                    "path_to_fork": "ul>li>a>span",
+                }
+            ]
+        }
+    }
+}
+
+
+# Richer nested-dict form: every text entry additionally has an `attributes`
+# dict (classes, ids, hrefs), and the list items are keyed `text1`, `text2`
+# instead of being stored in a list. Only two list items are present here.
+subtree_dict_complex = {
+    "div": {
+        "text": {
+            "content": "This is a paragraph",
+            "path_to_fork": "div>div",
+            "attributes": {
+                "classes": ["paragraph"],
+                "ids": ["paragraph"],
+                "hrefs": ["https://www.example.com"]
+            }
+        },
+        "ul": {
+            "text1":{
+                "content": "This is a list item 1",
+                "path_to_fork": "ul>li>a>span",
+                "attributes": {
+                    "classes": ["list-item", "item-1"],
+                    "ids": ["item-1"],
+                    "hrefs": ["https://www.example.com"]
+                }
+            },
+            "text2":{
+                "content": "This is a list item 2",
+                "path_to_fork": "ul>li>a>span",
+                "attributes": {
+                    "classes": ["list-item", "item-2"],
+                    "ids": ["item-2"],
+                    "hrefs": ["https://www.example.com"]
+                }
+            }
+        }
+    }
+}
+
+from playwright.sync_api import sync_playwright, Playwright
+
+def run(playwright: Playwright) -> None:
+    """
+    Run the example graph over the accessibility tree of a web page.
+
+    Launches Chromium through the given Playwright handle, opens the Wired
+    science category page, takes an accessibility snapshot, and passes its
+    string form to the module-level ``graph`` under the ``local_dir`` key
+    together with the user prompt. The ``answer`` entry of the result (or a
+    fallback message when it is missing) is printed.
+
+    Args:
+        playwright: The object yielded by ``sync_playwright()``.
+    """
+    chromium = playwright.chromium # or "firefox" or "webkit".
+    browser = chromium.launch()
+    # Close the browser even when navigation, the snapshot or the graph run
+    # raises, so a failed run does not leave a browser process behind.
+    try:
+        page = browser.new_page()
+        page.goto("https://www.wired.com/category/science/")
+        # get accessibility tree
+        accessibility_tree = page.accessibility.snapshot()
+
+        result, execution_info = graph.execute({
+            "user_prompt": "List me all the latest news with their description.",
+            "local_dir": str(accessibility_tree)
+        })
+
+        # get the answer from the result
+        result = result.get("answer", "No answer found.")
+        print(result)
+        # other actions...
+    finally:
+        browser.close()
+
+# Script entry: open a synchronous Playwright session for the duration of run().
+with sync_playwright() as playwright:
+    run(playwright)
+
@@ -0,0 +1,99 @@
+from langchain_community.document_loaders import AsyncHtmlLoader
+import time
+from scrapegraphai.asdt import DOMTree
+
+def index_subtrees(subtrees):
+    """
+    Group subtrees by the hashes stored on their root node.
+
+    Args:
+        subtrees: Iterable of objects exposing ``root.structure_hash`` and
+            ``root.content_hash``.
+
+    Returns:
+        A ``(structure_index, content_index)`` tuple of ``defaultdict(list)``
+        objects mapping each hash value to the subtrees whose root carries it,
+        in input order.
+    """
+    from collections import defaultdict
+
+    by_structure, by_content = defaultdict(list), defaultdict(list)
+
+    for tree in subtrees:
+        # Read both hashes before touching either index.
+        structure_key = tree.root.structure_hash
+        content_key = tree.root.content_hash
+
+        by_structure[structure_key].append(tree)
+        by_content[content_key].append(tree)
+
+    return by_structure, by_content
+
+def find_matching_subtrees(index):
+    """
+    Return every pair of subtrees that share a bucket of ``index``.
+
+    Args:
+        index: Mapping from a hash value to the list of subtrees filed under
+            it (the shape returned by ``index_subtrees``).
+
+    Returns:
+        A list of ``(subtree_a, subtree_b)`` tuples. Buckets are visited in
+        the mapping's iteration order and, within a bucket, each pair keeps
+        the bucket's element order. Buckets with fewer than two subtrees
+        contribute nothing.
+    """
+    from itertools import combinations
+
+    matches = []
+    # Only the buckets matter here; the hash keys themselves are not used.
+    for bucket in index.values():
+        # combinations(bucket, 2) yields (bucket[i], bucket[j]) for i < j in
+        # the same order as the nested index loops it replaces.
+        matches.extend(combinations(bucket, 2))
+    return matches
+
+def print_subtree_details(subtree):
+    """
+    Build a one-line summary of a subtree for comparison.
+
+    Despite the name, nothing is printed: each node visited by
+    ``subtree.traverse`` contributes a ``"<value>: <content>"`` entry, where
+    the content falls back to an empty string when the node's attributes have
+    no ``content`` key, and the entries are returned joined by ``" | "``.
+    """
+    entries = []
+
+    def record(node):
+        entries.append(f"{node.value}: {node.attributes.get('content', '')}")
+
+    subtree.traverse(record)
+    return " | ".join(entries)
+
+def print_matches_side_by_side(matches):
+    """
+    Print each matched pair of subtrees, one summary per line.
+
+    Args:
+        matches: Iterable of two-element pairs of subtrees; each element is
+            summarised with ``print_subtree_details``. Every pair is followed
+            by a dashed separator line.
+    """
+    separator = "\n" + "-"*100 + "\n"
+    for first, second in matches:
+        # Build both summaries before anything is written for this pair.
+        first_summary = print_subtree_details(first)
+        second_summary = print_subtree_details(second)
+        print("Match Pair:")
+        print("Subtree 1:", first_summary)
+        print("Subtree 2:", second_summary)
+        print(separator)
+
+# *********************************************************************************************************************
+# Usage example:
+# *********************************************************************************************************************
+
+# Download the page and keep the raw HTML of the first (only) document.
+loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
+document = loader.load()
+html_content = document[0].page_content
+
+curr_time = time.time()
+# Instantiate a DOMTree with HTML content
+dom_tree = DOMTree(html_content)
+# Capture the elapsed time right away so that the figure reported at the end
+# covers DOMTree construction only, not the subtree filtering and printing
+# that happen in between.
+build_time = time.time() - curr_time
+# nodes, metadatas = dom_tree.collect_text_nodes()  # Collect text nodes for analysis
+# for node, metadata in zip(nodes, metadatas):
+#     print("Text:", node)
+#     print("Metadata:", metadata)
+
+# sub_list = dom_tree.generate_subtree_dicts()  # Generate subtree dictionaries for analysis
+# print(sub_list)
+# graph = dom_tree.visualize(exclude_tags=['script', 'style', 'meta', 'link'])
+subtrees = dom_tree.get_subtrees()  # Retrieve subtrees rooted at fork nodes
+print("Number of subtrees found:", len(subtrees))
+
+# remove trees whose root node does not lead to any text
+text_subtrees = [subtree for subtree in subtrees if subtree.root.leads_to_text]
+print("Number of subtrees that lead to text:", len(text_subtrees))
+
+# of those, keep only the trees whose root reports direct leaves
+direct_leaf_subtrees = [subtree for subtree in text_subtrees if subtree.root.has_direct_leaves]
+print("Number of subtrees with direct leaves beneath fork nodes:", len(direct_leaf_subtrees))
+
+for subtree in direct_leaf_subtrees:
+    print("Subtree rooted at:", subtree.root.value)
+    subtree.traverse(lambda node: print(node))
+# Index subtrees by structure and content
+# structure_index, content_index = index_subtrees(subtrees)
+
+# # Find matches based on structure
+# structure_matches = find_matching_subtrees(structure_index)
+# print("Structure-based matches found:", len(structure_matches))
+
+# # Print structure-based matches side by side
+# print_matches_side_by_side(structure_matches)
+
+# # Optionally, do the same for content-based matches if needed
+# content_matches = find_matching_subtrees(content_index)
+# print("Content-based matches found:", len(content_matches))
+# print_matches_side_by_side(content_matches)
+
+print(f"Time taken to build DOM tree: {build_time:.2f} seconds")
+
+# Optionally, traverse each subtree
+# for subtree in subtrees:
+#     print("Subtree rooted at:", subtree.root.value)
+#     subtree.traverse(lambda node: print(node))
+# Traverse the DOMTree and print each node
+# dom_tree.traverse(lambda node: print(node))
@@ -0,0 +1,34 @@
+from langchain_community.document_loaders import TextLoader
+from langchain_community.vectorstores import FAISS
+from langchain_openai import OpenAIEmbeddings
+from langchain_text_splitters import CharacterTextSplitter
+from langchain_community.document_loaders import AsyncHtmlLoader
+import time
+from scrapegraphai.asdt import DOMTree
+from dotenv import load_dotenv
+import os
+
+# Load variables from a local .env file (if any), then read the API key;
+# os.getenv returns None when OPENAI_APIKEY is not set.
+load_dotenv()
+openai_key = os.getenv("OPENAI_APIKEY")
+embeddings = OpenAIEmbeddings(api_key=openai_key)
+
+# Download the page and keep the raw HTML of the first document.
+loader = AsyncHtmlLoader('https://perinim.github.io/projects/')
+document = loader.load()
+html_content = document[0].page_content
+
+# The timer covers DOMTree construction plus text-node collection.
+curr_time = time.time()
+# Instantiate a DOMTree with HTML content
+dom_tree = DOMTree(html_content)
+text_nodes, metadata = dom_tree.collect_text_nodes()  # Collect text nodes for analysis
+
+print(f"Time taken to collect text nodes: {time.time() - curr_time}")
+
+# Build a FAISS store from the collected texts, attaching the metadata
+# returned alongside them.
+db_texts = FAISS.from_texts(
+    texts=text_nodes,
+    embedding=embeddings,
+    metadatas=metadata
+)
+
+# Query for similar text
+# NOTE(review): `query` is defined but never passed to `db_texts` in the lines
+# shown - the similarity search itself appears to be missing.
+query = "List me all the projects"
+