
Commit d3e2eb6

add new benchmarks
1 parent 13669cc commit d3e2eb6

17 files changed, +721 −2 lines changed
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# Local models
The two websites used for the benchmark are:
- Example 1: https://perinim.github.io/projects
- Example 2: https://www.wired.com (snapshot taken on 17/4/2024)

Both are stored locally as .txt files so that the benchmark does not depend on the internet connection.
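The snapshots live under `inputs/`. A minimal sketch of how such a snapshot could be produced, assuming the `requests` library (the files in this commit may have been saved differently):

```python
# Hypothetical helper: save a page's raw HTML into a local .txt snapshot
# so the benchmark can run without an internet connection.
import requests

def save_snapshot(url: str, path: str) -> None:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(path, "w", encoding="utf-8") as f:
        f.write(response.text)

save_snapshot("https://perinim.github.io/projects", "inputs/example_1.txt")
save_snapshot("https://www.wired.com", "inputs/example_2.txt")
```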

The time is measured in seconds.

The model used for this benchmark is Mistral on Ollama, with nomic-embed-text for the embeddings.

| Hardware           | Example 1 | Example 2 |
| ------------------ | --------- | --------- |
| MacBook 14'' M1 Pro | 30.54    | 35.76     |
| MacBook M2 Max     |           |           |

**Note**: the Docker examples were run only on the MacBook because performance is too slow (about 10 times slower than Ollama). The results are the following:

| Hardware           | Example 1 | Example 2 |
| ------------------ | --------- | --------- |
| MacBook 14'' M1 Pro |          |           |
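For reference, the graph configuration used by the local-model benchmark script in this commit looks like the following (Ollama is assumed to be running locally on its default port):

```python
# Configuration for the local (Ollama) benchmark, as in the script in this commit.
graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "base_url": "http://localhost:11434",  # local Ollama endpoint
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",
    },
    "library": "beautifulsoup",
}
```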
# Performance on API services
### Example 1: personal portfolio
**URL**: https://perinim.github.io/projects
**Task**: List me all the projects with their description.

| Name                | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
| gpt-3.5-turbo       | 24.215268                | 1892         | 1802          | 90                | 1                   | 0.002883       |
| gpt-4-turbo-preview | 6.614                    | 1936         | 1802          | 134               | 1                   | 0.02204        |
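As a sanity check, total_cost_USD is consistent with the OpenAI per-token list prices at the time of the benchmark (assumed here: $0.0015/$0.002 per 1K prompt/completion tokens for gpt-3.5-turbo, $0.01/$0.03 per 1K for gpt-4-turbo-preview):

```python
# Reproduce total_cost_USD from the token counts in the table above.
# The per-1K-token prices are assumptions based on OpenAI pricing at the time.
def cost_usd(prompt_tokens, completion_tokens, prompt_price, completion_price):
    return prompt_tokens / 1000 * prompt_price + completion_tokens / 1000 * completion_price

print(round(cost_usd(1802, 90, 0.0015, 0.002), 6))  # 0.002883 (gpt-3.5-turbo)
print(round(cost_usd(1802, 134, 0.01, 0.03), 6))    # 0.02204  (gpt-4-turbo-preview)
```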

### Example 2: Wired
**URL**: https://www.wired.com
**Task**: List me all the articles with their description.

| Name                | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
| gpt-3.5-turbo       |                          |              |               |                   |                     |                |
| gpt-4-turbo-preview |                          |              |               |                   |                     |                |

examples/benchmarks/GenerateScraper/benchmark_docker.py

Whitespace-only changes.
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper from text
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import ScriptCreatorGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the text file
13+
# ************************************************
14+
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
15+
tasks = ["List me all the projects with their description.",
16+
"List me all the articles with their description."]
17+
18+
# ************************************************
19+
# Define the configuration for the graph
20+
# ************************************************
21+
22+
openai_key = os.getenv("GPT4_KEY")
23+
24+
25+
graph_config = {
26+
"llm": {
27+
"model": "ollama/mistral",
28+
"temperature": 0,
29+
# "model_tokens": 2000, # set context length arbitrarily,
30+
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
31+
},
32+
"embeddings": {
33+
"model": "ollama/nomic-embed-text",
34+
"temperature": 0,
35+
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
36+
},
37+
"library": "beautifoulsoup"
38+
}
39+
40+
41+
# ************************************************
42+
# Create the SmartScraperGraph instance and run it
43+
# ************************************************
44+
45+
for i in range(0, 2):
46+
with open(files[i], 'r', encoding="utf-8") as file:
47+
text = file.read()
48+
49+
smart_scraper_graph = ScriptCreatorGraph(
50+
prompt=tasks[i],
51+
source=text,
52+
config=graph_config
53+
)
54+
55+
result = smart_scraper_graph.run()
56+
print(result)
57+
# ************************************************
58+
# Get graph execution info
59+
# ************************************************
60+
61+
graph_exec_info = smart_scraper_graph.get_execution_info()
62+
print(prettify_exec_info(graph_exec_info))
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper from text
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import ScriptCreatorGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the text file
13+
# ************************************************
14+
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
15+
tasks = ["List me all the projects with their description.",
16+
"List me all the articles with their description."]
17+
18+
# ************************************************
19+
# Define the configuration for the graph
20+
# ************************************************
21+
22+
openai_key = os.getenv("GPT35_KEY")
23+
24+
graph_config = {
25+
"llm": {
26+
"api_key": openai_key,
27+
"model": "gpt-3.5-turbo",
28+
},
29+
"library": "beautifoulsoup"
30+
}
31+
32+
# ************************************************
33+
# Create the SmartScraperGraph instance and run it
34+
# ************************************************
35+
36+
for i in range(0, 2):
37+
with open(files[i], 'r', encoding="utf-8") as file:
38+
text = file.read()
39+
40+
smart_scraper_graph = ScriptCreatorGraph(
41+
prompt=tasks[i],
42+
source=text,
43+
config=graph_config
44+
)
45+
46+
result = smart_scraper_graph.run()
47+
print(result)
48+
# ************************************************
49+
# Get graph execution info
50+
# ************************************************
51+
52+
graph_exec_info = smart_scraper_graph.get_execution_info()
53+
print(prettify_exec_info(graph_exec_info))
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper from text
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import ScriptCreatorGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the text file
13+
# ************************************************
14+
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
15+
tasks = ["List me all the projects with their description.",
16+
"List me all the articles with their description."]
17+
18+
# ************************************************
19+
# Define the configuration for the graph
20+
# ************************************************
21+
22+
openai_key = os.getenv("GPT4_KEY")
23+
24+
graph_config = {
25+
"llm": {
26+
"api_key": openai_key,
27+
"model": "gpt-4-turbo-preview",
28+
},
29+
"library": "beautifoulsoup"
30+
}
31+
32+
# ************************************************
33+
# Create the SmartScraperGraph instance and run it
34+
# ************************************************
35+
36+
for i in range(0, 2):
37+
with open(files[i], 'r', encoding="utf-8") as file:
38+
text = file.read()
39+
40+
smart_scraper_graph = ScriptCreatorGraph(
41+
prompt=tasks[i],
42+
source=text,
43+
config=graph_config
44+
)
45+
46+
result = smart_scraper_graph.run()
47+
print(result)
48+
# ************************************************
49+
# Get graph execution info
50+
# ************************************************
51+
52+
graph_exec_info = smart_scraper_graph.get_execution_info()
53+
print(prettify_exec_info(graph_exec_info))
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
OPENAI_APIKEY="your openai api key"
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
This folder contains all the scripts used for the benchmarks.
Remember to set the API keys if you use OpenAI, or to complete the local setup if you use Ollama/Docker.
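
A minimal sketch of the key loading the OpenAI scripts rely on: the scripts in this commit read GPT35_KEY and GPT4_KEY through python-dotenv, while the example env file defines OPENAI_APIKEY, so make sure the names in your .env match what the scripts expect.

```python
# Check that the keys the benchmark scripts expect are present.
# GPT35_KEY / GPT4_KEY are the names used by the scripts in this commit;
# adjust them if your .env uses a different name (e.g. OPENAI_APIKEY).
import os
from dotenv import load_dotenv

load_dotenv()  # loads variables from a local .env file

for name in ("GPT35_KEY", "GPT4_KEY"):
    print(f"{name} set:", os.getenv(name) is not None)
```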
