Skip to content

Commit 90955ca

Browse files
committed
feat(gpt-4o): image to text single node test
1 parent d2877d8 commit 90955ca

File tree

4 files changed

+122
-2
lines changed

4 files changed

+122
-2
lines changed

examples/openai/smart_scraper_openai.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
graph_config = {
2020
"llm": {
2121
"api_key": openai_key,
22-
"model": "gpt-3.5-turbo",
22+
"model": "gpt-4o",
2323
},
2424
"verbose": True,
2525
"headless": False,
@@ -30,7 +30,7 @@
3030
# ************************************************
3131

3232
smart_scraper_graph = SmartScraperGraph(
33-
prompt="List me all the projects with their description.",
33+
prompt="List me all the projects with their description",
3434
# also accepts a string with the already downloaded HTML code
3535
source="https://perinim.github.io/projects/",
3636
config=graph_config
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
"""
Example of ImageToTextNode
"""

import os

from dotenv import load_dotenv

from scrapegraphai.models import OpenAIImageToText
from scrapegraphai.nodes import ImageToTextNode

load_dotenv()

# --- Graph configuration --------------------------------------------------
openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-4o",
        "temperature": 0,
    },
}

# --- Node definition ------------------------------------------------------
llm_model = OpenAIImageToText(graph_config["llm"])

image_to_text_node = ImageToTextNode(
    input="img_url",
    output=["img_desc"],
    node_config={"llm_model": llm_model, "headless": False},
)

# --- Exercise the node on a single image URL ------------------------------
state = {
    "img_url": (
        "https://github.com/VinciGit00/Scrapegraph-ai/blob/main/"
        "docs/assets/scrapegraphai_logo.png?raw=true"
    )
}

result = image_to_text_node.execute(state)
print(result)

scrapegraphai/helpers/models_tokens.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"gpt-4-0613": 8192,
1919
"gpt-4-32k": 32768,
2020
"gpt-4-32k-0613": 32768,
21+
"gpt-4o": 128000,
2122
},
2223
"azure": {
2324
"gpt-3.5-turbo": 4096,
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""
2+
ImageDescriptorNode Module
3+
"""
4+
5+
from typing import List, Optional
6+
from .base_node import BaseNode
7+
8+
class ImageDescriptorNode(BaseNode):
    """
    Retrieve images from a list of URLs and return a description of the images
    using an image-to-text model.

    Attributes:
        llm_model: An instance of the language model client used for image-to-text conversion.
        verbose (bool): A flag indicating whether to show print statements during execution.
        max_images (int): Maximum number of images described per execution (default 5).

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node. Must contain an
            "llm_model" entry; may contain "verbose" and "max_images".
        node_name (str): The unique identifier name for the node, defaulting to "ImageDescriptor".

    Raises:
        ValueError: If ``node_config`` is missing or lacks the "llm_model" entry.
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "ImageDescriptor",
    ):
        super().__init__(node_name, "node", input, output, 1, node_config)

        # The model is mandatory: fail fast with a clear message instead of an
        # opaque TypeError/KeyError (the original indexed node_config even
        # though the signature allows None).
        if node_config is None or "llm_model" not in node_config:
            raise ValueError(
                "ImageDescriptorNode requires an 'llm_model' entry in node_config"
            )
        self.llm_model = node_config["llm_model"]
        self.verbose = node_config.get("verbose", False)
        self.max_images = node_config.get("max_images", 5)

    def execute(self, state: dict) -> dict:
        """
        Generate text from an image using an image-to-text model. The method retrieves the
        image(s) from the URL(s) provided in the state and returns the extracted text.

        Args:
            state (dict): The current state of the graph. The input keys will be used to fetch
                the correct data types from the state.

        Returns:
            dict: The updated state with the output key containing the list of descriptions,
            one per image (capped at ``max_images``). Returned unchanged when no URLs given.
        """

        if self.verbose:
            print(f"--- Executing {self.node_name} Node ---")

        input_keys = self.get_input_keys(state)
        input_data = [state[key] for key in input_keys]
        urls = input_data[0]

        # BUG FIX: a single URL may arrive as a bare string. The original
        # check (len(urls) == 1 and not isinstance(urls, list)) only matched
        # one-character strings, so a real URL fell through and the loop
        # below iterated over its individual characters. Wrap any string.
        if isinstance(urls, str):
            urls = [urls]
        if len(urls) == 0:
            # Nothing to describe; leave state untouched.
            return state

        img_desc = []
        for url in urls[:self.max_images]:
            text_answer = self.llm_model.run(url)
            img_desc.append(text_answer)

        state.update({self.output[0]: img_desc})
        return state

0 commit comments

Comments (0)