Skip to content

Commit 5d1fbf8

Browse files
committed
feat(indexify-node): add example
1 parent 5c9843f commit 5d1fbf8

File tree

3 files changed

+153
-1
lines changed

3 files changed

+153
-1
lines changed
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
"""
Basic example of a scraping pipeline using SmartScraperGraph with a Pydantic
output schema, extended with a custom IndexifyNode appended to the graph.
"""

import json
import os
from typing import List

from dotenv import load_dotenv
from pydantic import BaseModel, Field

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.integrations import IndexifyNode

# Load environment variables (e.g. OPENAI_APIKEY) from a local .env file
# before any configuration reads them.
load_dotenv()

# ************************************************
# Define the output schema for the graph
# ************************************************

class Image(BaseModel):
    url: str = Field(description="The url of the image")

class Images(BaseModel):
    images: List[Image]

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Define the custom nodes for the graph
# ************************************************

indexify_node = IndexifyNode(
    # Boolean expression over state keys: both "answer" and "img_urls"
    # must be present in the graph state for this node to run.
    input="answer & img_urls",
    output=["is_indexed"],
    node_config={
        "verbose": True
    }
)

# ************************************************
# Create the SmartScraperGraph instance
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the images with their url",
    source="https://giphy.com/",
    schema=Images,
    config=graph_config
)

# Add the custom node to the graph
smart_scraper_graph.append_node(indexify_node)

# ************************************************
# Run the SmartScraperGraph
# ************************************************

result = smart_scraper_graph.run()
print(json.dumps(result, indent=2))

scrapegraphai/integrations/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
Init file for integrations module
33
"""
44

5-
from .burr_bridge import BurrBridge
5+
from .burr_bridge import BurrBridge
6+
from .indexify_node import IndexifyNode
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""
2+
IndexifyNode Module
3+
"""
4+
5+
from typing import List, Optional
6+
7+
from ..utils.logging import get_logger
8+
from ..nodes.base_node import BaseNode
9+
10+
# try:
11+
# import indexify
12+
# except ImportError:
13+
# raise ImportError("indexify package is not installed. Please install it with 'pip install scrapegraphai[indexify]'")
14+
15+
16+
class IndexifyNode(BaseNode):
    """
    A node responsible for indexing the content present in the state.

    Attributes:
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "Indexify".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "Indexify",
    ):
        # min_input_len=2: this node expects exactly two keys to be resolved
        # from the input expression (e.g. "answer & img_urls").
        super().__init__(node_name, "node", input, output, 2, node_config)

        self.verbose = (
            False if node_config is None else node_config.get("verbose", False)
        )

    def execute(self, state: dict) -> dict:
        """
        Executes the node's logic to index the content present in the state.

        Args:
            state (dict): The current state of the graph. The input keys will be used to fetch the
                correct data from the state.

        Returns:
            dict: The updated state with the first output key set to the indexing result.

        Raises:
            KeyError: If the input keys are not found in the state, indicating that the
                necessary information for indexing the content is missing.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression
        # (e.g. "answer & parsed_doc" or "answer | img_urls"); the number of
        # resolved keys matches the min_input_len passed to __init__.
        input_keys = self.get_input_keys(state)

        # Fetching data from the state based on the input keys
        input_data = [state[key] for key in input_keys]

        answer = input_data[0]
        img_urls = input_data[1]

        # Indexify the content
        # TODO: actual indexing via the indexify client is not implemented yet;
        # `answer` and `img_urls` are fetched but unused until it is.
        is_indexed = True
        state.update({self.output[0]: is_indexed})

        return state

0 commit comments

Comments
 (0)