Skip to content

Commit 90955ca

Browse files
committed
feat(gpt-4o): image to text single node test
1 parent d2877d8 commit 90955ca

File tree

4 files changed

+122
-2
lines changed

4 files changed

+122
-2
lines changed

examples/openai/smart_scraper_openai.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
graph_config = {
2020
"llm": {
2121
"api_key": openai_key,
22-
"model": "gpt-3.5-turbo",
22+
"model": "gpt-4o",
2323
},
2424
"verbose": True,
2525
"headless": False,
@@ -30,7 +30,7 @@
3030
# ************************************************
3131

3232
smart_scraper_graph = SmartScraperGraph(
33-
prompt="List me all the projects with their description.",
33+
prompt="List me all the projects with their description",
3434
# also accepts a string with the already downloaded HTML code
3535
source="https://perinim.github.io/projects/",
3636
config=graph_config
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
"""
Example of ImageToTextNode
"""

import os

from dotenv import load_dotenv

from scrapegraphai.models import OpenAIImageToText
from scrapegraphai.nodes import ImageToTextNode

load_dotenv()

# --- Graph configuration --------------------------------------------------
openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-4o",
        "temperature": 0,
    },
}

# --- Node definition ------------------------------------------------------
llm_model = OpenAIImageToText(graph_config["llm"])

image_to_text_node = ImageToTextNode(
    input="img_url",
    output=["img_desc"],
    node_config={"llm_model": llm_model, "headless": False},
)

# --- Exercise the node on a single image URL ------------------------------
state = {
    "img_url": (
        "https://github.com/VinciGit00/Scrapegraph-ai/blob/main/"
        "docs/assets/scrapegraphai_logo.png?raw=true"
    )
}

result = image_to_text_node.execute(state)
print(result)

scrapegraphai/helpers/models_tokens.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"gpt-4-0613": 8192,
1919
"gpt-4-32k": 32768,
2020
"gpt-4-32k-0613": 32768,
21+
"gpt-4o": 128000,
2122
},
2223
"azure": {
2324
"gpt-3.5-turbo": 4096,
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""
2+
ImageDescriptorNode Module
3+
"""
4+
5+
from typing import List, Optional
6+
from .base_node import BaseNode
7+
8+
class ImageDescriptorNode(BaseNode):
    """
    Retrieve images from a list of URLs and return a description of the images
    using an image-to-text model.

    Attributes:
        llm_model: An instance of the language model client used for image-to-text conversion.
        verbose (bool): A flag indicating whether to show print statements during execution.
        max_images (int): Maximum number of images described per execution (default 5).

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node. Must contain an
            "llm_model" entry; may contain "verbose" and "max_images".
        node_name (str): The unique identifier name for the node, defaulting to "ImageDescriptor".

    Raises:
        ValueError: If ``node_config`` is missing or lacks the "llm_model" entry.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "ImageDescriptor",
    ):
        super().__init__(node_name, "node", input, output, 1, node_config)

        # The model is mandatory: fail fast with a clear message instead of an
        # opaque TypeError/KeyError (the original indexed node_config even
        # though the signature allows None).
        if node_config is None or "llm_model" not in node_config:
            raise ValueError(
                "ImageDescriptorNode requires an 'llm_model' entry in node_config"
            )
        self.llm_model = node_config["llm_model"]
        self.verbose = node_config.get("verbose", False)
        self.max_images = node_config.get("max_images", 5)

    def execute(self, state: dict) -> dict:
        """
        Generate text from an image using an image-to-text model. The method retrieves the
        image(s) from the URL(s) provided in the state and returns the extracted text.

        Args:
            state (dict): The current state of the graph. The input keys will be used to fetch
                the correct data types from the state.

        Returns:
            dict: The updated state with the output key containing the list of descriptions,
            one per image (capped at ``max_images``). Returned unchanged when no URLs given.
        """

        if self.verbose:
            print(f"--- Executing {self.node_name} Node ---")

        input_keys = self.get_input_keys(state)
        input_data = [state[key] for key in input_keys]
        urls = input_data[0]

        # BUG FIX: a single URL may arrive as a bare string. The original
        # check (len(urls) == 1 and not isinstance(urls, list)) only matched
        # one-character strings, so a real URL fell through and the loop
        # below iterated over its individual characters. Wrap any string.
        if isinstance(urls, str):
            urls = [urls]
        if len(urls) == 0:
            # Nothing to describe; leave state untouched.
            return state

        img_desc = []
        for url in urls[:self.max_images]:
            text_answer = self.llm_model.run(url)
            img_desc.append(text_answer)

        state.update({self.output[0]: img_desc})
        return state

0 commit comments

Comments (0)