1+ """
2+ OmniScraperGraph Module
3+ """
4+
5+ from .base_graph import BaseGraph
6+ from ..nodes import (
7+ FetchNode ,
8+ ParseNode ,
9+ ImageToTextNode ,
10+ RAGNode ,
11+ GenerateAnswerOmniNode
12+ )
13+ from scrapegraphai .models import OpenAIImageToText
14+ from .abstract_graph import AbstractGraph
15+
16+
class OmniScraperGraph(AbstractGraph):
    """
    OmniScraper is a scraping pipeline that automates the process of
    extracting information from web pages
    using a natural language model to interpret and answer prompts.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client,
        configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
        max_images (int): Maximum number of images passed to the image-to-text
        node (default 5, overridable via config["max_images"]).

    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.

    Example:
        >>> omni_scraper = OmniScraperGraph(
        ...     "List me all the attractions in Chioggia and describe their pictures.",
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-4o"}}
        ... )
        >>> result = omni_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initialize the pipeline with a user prompt, a source and a configuration.

        Args:
            prompt (str): The prompt for the graph.
            source (str): URL (anything starting with "http") or a local directory.
            config (dict): Configuration parameters for the graph.
        """
        # NOTE: max_images must be assigned *before* super().__init__ —
        # AbstractGraph.__init__ builds the graph via _create_graph, which
        # reads self.max_images.
        self.max_images = 5 if config is None else config.get("max_images", 5)

        super().__init__(prompt, config, source)

        # Graph input key: remote sources are fetched by "url",
        # everything else is treated as a local directory.
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        # Fetch page content plus the link and image URLs found in it.
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc", "link_urls", "img_urls"],
            node_config={
                "loader_kwargs": self.config.get("loader_kwargs", {}),
            }
        )
        # Split the document into chunks sized to the model's token limit.
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token
            }
        )
        # Describe up to max_images of the scraped images with a vision model.
        image_to_text_node = ImageToTextNode(
            input="img_urls",
            output=["img_desc"],
            node_config={
                "llm_model": OpenAIImageToText(self.config["llm"]),
                "max_images": self.max_images
            }
        )
        # Retrieve the chunks most relevant to the user prompt.
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm_model": self.llm_model,
                "embedder_model": self.embedder_model
            }
        )
        # Combine text chunks and image descriptions into the final answer.
        generate_answer_omni_node = GenerateAnswerOmniNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                image_to_text_node,
                rag_node,
                generate_answer_omni_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, image_to_text_node),
                (image_to_text_node, rag_node),
                (rag_node, generate_answer_omni_node)
            ],
            entry_point=fetch_node
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            str: The answer to the prompt, or "No answer found." if the
            graph produced no "answer" key.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
0 commit comments