Skip to content

Commit 4d42d7b

Browse files
committed
add example
1 parent f5cbd80 commit 4d42d7b

File tree

5 files changed

+179
-11
lines changed

5 files changed

+179
-11
lines changed
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
Example showing how JsonScraperMultiGraph works with a local Ollama model.

Reads a JSON file from ./inputs, passes its content (as a list of sources)
to JsonScraperMultiGraph together with a prompt, and prints the merged answer.
"""
import os
import json

from scrapegraphai.graphs import JsonScraperMultiGraph

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        "model_tokens": 4000,
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
    },
    "verbose": True,
    "headless": False,
}

# ************************************************
# Read the JSON file to scrape
# ************************************************

FILE_NAME = "inputs/example.json"
curr_dir = os.path.dirname(os.path.realpath(__file__))
file_path = os.path.join(curr_dir, FILE_NAME)

with open(file_path, 'r', encoding="utf-8") as file:
    text = file.read()

# JsonScraperMultiGraph expects a list of sources: pass the file content
# (not the file object / path) for each document to scrape.
sources = [text, text]

# ************************************************
# Create the JsonScraperMultiGraph instance and run it
# ************************************************

multiple_search_graph = JsonScraperMultiGraph(
    prompt="List me all the authors, title and genres of the books",
    source=sources,
    schema=None,
    config=graph_config,
)

result = multiple_search_graph.run()
print(json.dumps(result, indent=4))

examples/local_models/pdf_scraper_multi_ollama.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
Module for showing how PDFScraper multi works
33
"""
4+
import json
45
from scrapegraphai.graphs import PdfScraperMultiGraph
56

67
graph_config = {
@@ -56,14 +57,16 @@
5657
Dependent Variable (DV): Mental health outcomes.
5758
Exogenous Shock: staggered introduction of Facebook across U.S. colleges.
5859
"""
59-
results = []
60-
for source in sources:
61-
pdf_scraper_graph = PdfScraperMultiGraph(
62-
prompt=prompt,
63-
source=source,
64-
config=graph_config
65-
)
66-
result = pdf_scraper_graph.run()
67-
results.append(result)
60+
# *******************************************************
61+
# Create the SmartScraperMultiGraph instance and run it
62+
# *******************************************************
6863

69-
print(results)
64+
multiple_search_graph = PdfScraperMultiGraph(
65+
prompt=prompt,
66+
source= sources,
67+
schema=None,
68+
config=graph_config
69+
)
70+
71+
result = multiple_search_graph.run()
72+
print(json.dumps(result, indent=4))

examples/openai/smart_scraper_multi_openai.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
Basic example of scraping pipeline using SmartScraper
33
"""
44

5-
import os, json
5+
import os
6+
import json
67
from dotenv import load_dotenv
78
from scrapegraphai.graphs import SmartScraperMultiGraph
89

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@
1717
from .omni_search_graph import OmniSearchGraph
1818
from .smart_scraper_multi_graph import SmartScraperMultiGraph
1919
from .pdf_scraper_multi import PdfScraperMultiGraph
20+
from .json_scraper_multi import JsonScraperMultiGraph
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
"""
2+
JsonScraperMultiGraph Module
3+
"""
4+
5+
from copy import copy, deepcopy
6+
from typing import List, Optional
7+
8+
from .base_graph import BaseGraph
9+
from .abstract_graph import AbstractGraph
10+
from .json_scraper_graph import JSONScraperGraph
11+
12+
from ..nodes import (
13+
GraphIteratorNode,
14+
MergeAnswersNode
15+
)
16+
17+
18+
class JsonScraperMultiGraph(AbstractGraph):
    """
    JsonScraperMultiGraph is a scraping pipeline that scrapes a list of JSON
    sources and generates a single merged answer to a given prompt.
    It only requires a user prompt and a list of sources.

    Attributes:
        prompt (str): The user prompt to search the internet.
        llm_model (dict): The configuration for the language model.
        embedder_model (dict): The configuration for the embedder model.
        headless (bool): A flag to run the browser in headless mode.
        verbose (bool): A flag to display the execution information.
        model_token (int): The token limit for the language model.

    Args:
        prompt (str): The user prompt to search the internet.
        source (List[str]): The source of the graph.
        config (dict): Configuration parameters for the graph.
        schema (Optional[str]): The schema for the graph output.

    Example:
        >>> search_graph = JsonScraperMultiGraph(
        ...     "What is Chioggia famous for?",
        ...     {"llm": {"model": "gpt-3.5-turbo"}}
        ... )
        >>> result = search_graph.run()
    """

    def __init__(self, prompt: str, source: List[str], config: dict, schema: Optional[str] = None):
        # NOTE(review): stored but not read anywhere in this class — presumably
        # consumed by a parent/node; verify before removing.
        self.max_results = config.get("max_results", 3)

        # Shallow-copy the config when every value is a plain string (cheap and
        # safe); otherwise deep-copy so the per-source sub-graphs cannot mutate
        # the caller's nested config objects.
        if all(isinstance(value, str) for value in config.values()):
            self.copy_config = copy(config)
        else:
            self.copy_config = deepcopy(config)

        super().__init__(prompt, config, source, schema)

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping and searching.

        Returns:
            BaseGraph: A graph instance representing the web scraping and searching workflow.
        """

        # ************************************************
        # Create a JSONScraperGraph instance
        # ************************************************

        # Template sub-graph: prompt/source are left empty here; the iterator
        # node fills them in per source at execution time.
        smart_scraper_instance = JSONScraperGraph(
            prompt="",
            source="",
            config=self.copy_config,
        )

        # ************************************************
        # Define the graph nodes
        # ************************************************

        # Runs one JSONScraperGraph per entry in the "jsons" input list.
        graph_iterator_node = GraphIteratorNode(
            input="user_prompt & jsons",
            output=["results"],
            node_config={
                "graph_instance": smart_scraper_instance,
            }
        )

        # Merges the per-source results into a single answer via the LLM.
        merge_answers_node = MergeAnswersNode(
            input="user_prompt & results",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model,
                "schema": self.schema
            }
        )

        return BaseGraph(
            nodes=[
                graph_iterator_node,
                merge_answers_node,
            ],
            edges=[
                (graph_iterator_node, merge_answers_node),
            ],
            entry_point=graph_iterator_node
        )

    def run(self) -> str:
        """
        Executes the web scraping and searching process.

        Returns:
            str: The answer to the prompt.
        """
        inputs = {"user_prompt": self.prompt, "jsons": self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")

0 commit comments

Comments
 (0)