Skip to content

Commit f41a755

Browse files
authored
Merge pull request #356 from VinciGit00/321-integration-with-indexify
Fixed the Pydantic schema handling
2 parents dd2b3a8 + 5d1fbf8 commit f41a755

File tree

9 files changed

+167
-36
lines changed

9 files changed

+167
-36
lines changed
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
"""
Basic example of scraping pipeline using SmartScraper with schema.

Scrapes image URLs from a page into a Pydantic `Images` schema and then
passes the result through a custom `IndexifyNode` appended to the graph.
"""

import json
import os
from typing import List

from dotenv import load_dotenv
from pydantic import BaseModel, Field

from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.integrations import IndexifyNode

# Load environment variables (OPENAI_APIKEY) from a local .env file.
load_dotenv()

# ************************************************
# Define the output schema for the graph
# ************************************************

class Image(BaseModel):
    url: str = Field(description="The url of the image")

class Images(BaseModel):
    images: List[Image]

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")

graph_config = {
    "llm": {
        "api_key": openai_key,
        "model": "gpt-3.5-turbo",
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Define the custom nodes for the graph
# ************************************************

# "answer & img_urls" is a boolean input expression: both state keys
# must be present for the node to run.
indexify_node = IndexifyNode(
    input="answer & img_urls",
    output=["is_indexed"],
    node_config={
        "verbose": True
    }
)

# ************************************************
# Create the SmartScraperGraph instance
# ************************************************

smart_scraper_graph = SmartScraperGraph(
    prompt="List me all the images with their url",
    source="https://giphy.com/",
    schema=Images,
    config=graph_config
)

# Add the custom node to the graph
smart_scraper_graph.append_node(indexify_node)

# ************************************************
# Run the SmartScraperGraph
# ************************************************

result = smart_scraper_graph.run()
print(json.dumps(result, indent=2))

requirements-dev.lock

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -185,10 +185,6 @@ idna==3.7
185185
# via yarl
186186
imagesize==1.4.1
187187
# via sphinx
188-
importlib-metadata==7.1.0
189-
# via sphinx
190-
importlib-resources==6.4.0
191-
# via matplotlib
192188
iniconfig==2.0.0
193189
# via pytest
194190
jinja2==3.1.4
@@ -475,7 +471,6 @@ typing-extensions==4.12.0
475471
# via pyee
476472
# via sf-hamilton
477473
# via sqlalchemy
478-
# via starlette
479474
# via streamlit
480475
# via typer
481476
# via typing-inspect
@@ -507,6 +502,3 @@ win32-setctime==1.1.0
507502
# via loguru
508503
yarl==1.9.4
509504
# via aiohttp
510-
zipp==3.19.1
511-
# via importlib-metadata
512-
# via importlib-resources

scrapegraphai/integrations/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
Init file for integrations module
33
"""
44

5-
from .burr_bridge import BurrBridge
5+
from .burr_bridge import BurrBridge
6+
from .indexify_node import IndexifyNode
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
"""
2+
IndexifyNode Module
3+
"""
4+
5+
from typing import List, Optional
6+
7+
from ..utils.logging import get_logger
8+
from ..nodes.base_node import BaseNode
9+
10+
# try:
11+
# import indexify
12+
# except ImportError:
13+
# raise ImportError("indexify package is not installed. Please install it with 'pip install scrapegraphai[indexify]'")
14+
15+
16+
class IndexifyNode(BaseNode):
    """
    A node responsible for indexing the content present in the state.

    Attributes:
        verbose (bool): A flag indicating whether to show print statements during execution.

    Args:
        input (str): Boolean expression defining the input keys needed from the state.
        output (List[str]): List of output keys to be updated in the state.
        node_config (dict): Additional configuration for the node.
        node_name (str): The unique identifier name for the node, defaulting to "Indexify".
    """

    def __init__(
        self,
        input: str,
        output: List[str],
        node_config: Optional[dict] = None,
        node_name: str = "Indexify",
    ):
        # min_input_len is 2: the input expression must resolve to exactly
        # two state keys (e.g. "answer & img_urls").
        super().__init__(node_name, "node", input, output, 2, node_config)

        self.verbose = (
            False if node_config is None else node_config.get("verbose", False)
        )

    def execute(self, state: dict) -> dict:
        """
        Executes the node's logic to index the content present in the state.

        Args:
            state (dict): The current state of the graph. The input keys will be used to fetch the
                correct data from the state.

        Returns:
            dict: The updated state with the output key containing the parsed content chunks.

        Raises:
            KeyError: If the input keys are not found in the state, indicating that the
                necessary information for parsing the content is missing.
        """

        self.logger.info(f"--- Executing {self.node_name} Node ---")

        # Interpret input keys based on the provided input expression.
        # input_keys length matches the min_input_len parameter in __init__,
        # e.g. "answer & parsed_doc" or "answer | img_urls".
        input_keys = self.get_input_keys(state)

        # Fetching data from the state based on the input keys
        answer, img_urls = (state[key] for key in input_keys)

        # Indexify the content
        # TODO(review): actual indexing of `answer` / `img_urls` is not
        # implemented yet; the flag below is a placeholder success marker.
        is_indexed = True
        state.update({self.output[0]: is_indexed})

        return state

scrapegraphai/nodes/generate_answer_csv_node.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
# Imports from Langchain
1010
from langchain.prompts import PromptTemplate
11-
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
11+
from langchain_core.output_parsers import JsonOutputParser
1212
from langchain_core.runnables import RunnableParallel
1313
from tqdm import tqdm
1414

@@ -96,7 +96,7 @@ def execute(self, state):
9696

9797
# Initialize the output parser
9898
if self.node_config.get("schema", None) is not None:
99-
output_parser = PydanticOutputParser(pydantic_object=self.node_config.get("schema", None))
99+
output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
100100
else:
101101
output_parser = JsonOutputParser()
102102

@@ -150,9 +150,6 @@ def execute(self, state):
150150
single_chain = list(chains_dict.values())[0]
151151
answer = single_chain.invoke({"question": user_prompt})
152152

153-
if type(answer) == PydanticOutputParser:
154-
answer = answer.model_dump()
155-
156153
# Update the state with the generated answer
157154
state.update({self.output[0]: answer})
158155
return state

scrapegraphai/nodes/generate_answer_node.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,11 @@
77

88
# Imports from Langchain
99
from langchain.prompts import PromptTemplate
10-
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
10+
from langchain_core.output_parsers import JsonOutputParser
1111
from langchain_core.runnables import RunnableParallel
1212
from tqdm import tqdm
1313

14+
1415
from ..utils.logging import get_logger
1516
from ..models import Ollama
1617
# Imports from the library
@@ -81,8 +82,8 @@ def execute(self, state: dict) -> dict:
8182
doc = input_data[1]
8283

8384
# Initialize the output parser
84-
if self.node_config.get("schema",None) is not None:
85-
output_parser = PydanticOutputParser(pydantic_object=self.node_config.get("schema", None))
85+
if self.node_config.get("schema", None) is not None:
86+
output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
8687
else:
8788
output_parser = JsonOutputParser()
8889

@@ -129,9 +130,6 @@ def execute(self, state: dict) -> dict:
129130
single_chain = list(chains_dict.values())[0]
130131
answer = single_chain.invoke({"question": user_prompt})
131132

132-
if type(answer) == PydanticOutputParser:
133-
answer = answer.model_dump()
134-
135133
# Update the state with the generated answer
136134
state.update({self.output[0]: answer})
137135
return state

scrapegraphai/nodes/generate_answer_omni_node.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
# Imports from Langchain
99
from langchain.prompts import PromptTemplate
10-
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
10+
from langchain_core.output_parsers import JsonOutputParser
1111
from langchain_core.runnables import RunnableParallel
1212
from tqdm import tqdm
1313
from ..models import Ollama
@@ -82,7 +82,7 @@ def execute(self, state: dict) -> dict:
8282

8383
# Initialize the output parser
8484
if self.node_config.get("schema", None) is not None:
85-
output_parser = PydanticOutputParser(pydantic_object=self.node_config.get("schema", None))
85+
output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
8686
else:
8787
output_parser = JsonOutputParser()
8888

@@ -141,9 +141,6 @@ def execute(self, state: dict) -> dict:
141141
single_chain = list(chains_dict.values())[0]
142142
answer = single_chain.invoke({"question": user_prompt})
143143

144-
if type(answer) == PydanticOutputParser:
145-
answer = answer.model_dump()
146-
147144
# Update the state with the generated answer
148145
state.update({self.output[0]: answer})
149146
return state

scrapegraphai/nodes/generate_answer_pdf_node.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
# Imports from Langchain
99
from langchain.prompts import PromptTemplate
10-
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
10+
from langchain_core.output_parsers import JsonOutputParser
1111
from langchain_core.runnables import RunnableParallel
1212
from tqdm import tqdm
1313
from ..models import Ollama
@@ -96,8 +96,8 @@ def execute(self, state):
9696
doc = input_data[1]
9797

9898
# Initialize the output parser
99-
if self.node_config.get("schema",None) is not None:
100-
output_parser = PydanticOutputParser(pydantic_object=self.node_config.get("schema", None))
99+
if self.node_config.get("schema", None) is not None:
100+
output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
101101
else:
102102
output_parser = JsonOutputParser()
103103

scrapegraphai/nodes/merge_answers_node.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
# Imports from Langchain
1010
from langchain.prompts import PromptTemplate
11-
from langchain_core.output_parsers import JsonOutputParser, PydanticOutputParser
11+
from langchain_core.output_parsers import JsonOutputParser
1212
from tqdm import tqdm
1313

1414
from ..utils.logging import get_logger
@@ -80,10 +80,8 @@ def execute(self, state: dict) -> dict:
8080
answers_str += f"CONTENT WEBSITE {i+1}: {answer}\n"
8181

8282
# Initialize the output parser
83-
if self.node_config["schema"] is not None:
84-
output_parser = PydanticOutputParser(
85-
pydantic_object=self.node_config["schema"]
86-
)
83+
if self.node_config.get("schema", None) is not None:
84+
output_parser = JsonOutputParser(pydantic_object=self.node_config["schema"])
8785
else:
8886
output_parser = JsonOutputParser()
8987

@@ -111,9 +109,6 @@ def execute(self, state: dict) -> dict:
111109
merge_chain = prompt_template | self.llm_model | output_parser
112110
answer = merge_chain.invoke({"user_prompt": user_prompt})
113111

114-
if type(answer) == PydanticOutputParser:
115-
answer = answer.model_dump()
116-
117112
# Update the state with the generated answer
118113
state.update({self.output[0]: answer})
119114
return state

0 commit comments

Comments
 (0)