
Commit d3e2eb6

add new benchmarks
1 parent 13669cc commit d3e2eb6

17 files changed, +721 −2 lines changed
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# Local models
The two websites used for the benchmark are:
- Example 1: https://perinim.github.io/projects
- Example 2: https://www.wired.com (snapshot taken on 17/4/2024)

Both are stored locally as .txt files so that the benchmark does not depend on the internet connection.
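The snapshots live under `inputs/`. A minimal sketch of how such a snapshot could be produced, assuming the `requests` library (the files in this commit may have been saved differently):

```python
# Hypothetical helper: save a page's raw HTML into a local .txt snapshot
# so the benchmark can run without an internet connection.
import requests

def save_snapshot(url: str, path: str) -> None:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(path, "w", encoding="utf-8") as f:
        f.write(response.text)

save_snapshot("https://perinim.github.io/projects", "inputs/example_1.txt")
save_snapshot("https://www.wired.com", "inputs/example_2.txt")
```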

The time is measured in seconds.

The model used for this benchmark is Mistral on Ollama, with nomic-embed-text for the embeddings.

| Hardware           | Example 1 | Example 2 |
| ------------------ | --------- | --------- |
| MacBook 14'' M1 Pro | 30.54    | 35.76     |
| MacBook M2 Max     |           |           |

**Note**: the Docker examples were run only on the MacBook because performance is too slow (about 10 times slower than Ollama). The results are the following:

| Hardware           | Example 1 | Example 2 |
| ------------------ | --------- | --------- |
| MacBook 14'' M1 Pro |          |           |
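For reference, the graph configuration used by the local-model benchmark script in this commit looks like the following (Ollama is assumed to be running locally on its default port):

```python
# Configuration for the local (Ollama) benchmark, as in the script in this commit.
graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "base_url": "http://localhost:11434",  # local Ollama endpoint
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",
    },
    "library": "beautifulsoup",
}
```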
# Performance on API services
### Example 1: personal portfolio
**URL**: https://perinim.github.io/projects
**Task**: List me all the projects with their description.

| Name                | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
| gpt-3.5-turbo       | 24.215268                | 1892         | 1802          | 90                | 1                   | 0.002883       |
| gpt-4-turbo-preview | 6.614                    | 1936         | 1802          | 134               | 1                   | 0.02204        |
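As a sanity check, total_cost_USD is consistent with the OpenAI per-token list prices at the time of the benchmark (assumed here: $0.0015/$0.002 per 1K prompt/completion tokens for gpt-3.5-turbo, $0.01/$0.03 per 1K for gpt-4-turbo-preview):

```python
# Reproduce total_cost_USD from the token counts in the table above.
# The per-1K-token prices are assumptions based on OpenAI pricing at the time.
def cost_usd(prompt_tokens, completion_tokens, prompt_price, completion_price):
    return prompt_tokens / 1000 * prompt_price + completion_tokens / 1000 * completion_price

print(round(cost_usd(1802, 90, 0.0015, 0.002), 6))  # 0.002883 (gpt-3.5-turbo)
print(round(cost_usd(1802, 134, 0.01, 0.03), 6))    # 0.02204  (gpt-4-turbo-preview)
```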

### Example 2: Wired
**URL**: https://www.wired.com
**Task**: List me all the articles with their description.

| Name                | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
| gpt-3.5-turbo       |                          |              |               |                   |                     |                |
| gpt-4-turbo-preview |                          |              |               |                   |                     |                |

examples/benchmarks/GenerateScraper/benchmark_docker.py

Whitespace-only changes.
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper from text
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import ScriptCreatorGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the text file
13+
# ************************************************
14+
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
15+
tasks = ["List me all the projects with their description.",
16+
"List me all the articles with their description."]
17+
18+
# ************************************************
19+
# Define the configuration for the graph
20+
# ************************************************
21+
22+
openai_key = os.getenv("GPT4_KEY")
23+
24+
25+
graph_config = {
26+
"llm": {
27+
"model": "ollama/mistral",
28+
"temperature": 0,
29+
# "model_tokens": 2000, # set context length arbitrarily,
30+
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
31+
},
32+
"embeddings": {
33+
"model": "ollama/nomic-embed-text",
34+
"temperature": 0,
35+
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
36+
},
37+
"library": "beautifoulsoup"
38+
}
39+
40+
41+
# ************************************************
42+
# Create the SmartScraperGraph instance and run it
43+
# ************************************************
44+
45+
for i in range(0, 2):
46+
with open(files[i], 'r', encoding="utf-8") as file:
47+
text = file.read()
48+
49+
smart_scraper_graph = ScriptCreatorGraph(
50+
prompt=tasks[i],
51+
source=text,
52+
config=graph_config
53+
)
54+
55+
result = smart_scraper_graph.run()
56+
print(result)
57+
# ************************************************
58+
# Get graph execution info
59+
# ************************************************
60+
61+
graph_exec_info = smart_scraper_graph.get_execution_info()
62+
print(prettify_exec_info(graph_exec_info))
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper from text
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import ScriptCreatorGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the text file
13+
# ************************************************
14+
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
15+
tasks = ["List me all the projects with their description.",
16+
"List me all the articles with their description."]
17+
18+
# ************************************************
19+
# Define the configuration for the graph
20+
# ************************************************
21+
22+
openai_key = os.getenv("GPT35_KEY")
23+
24+
graph_config = {
25+
"llm": {
26+
"api_key": openai_key,
27+
"model": "gpt-3.5-turbo",
28+
},
29+
"library": "beautifoulsoup"
30+
}
31+
32+
# ************************************************
33+
# Create the SmartScraperGraph instance and run it
34+
# ************************************************
35+
36+
for i in range(0, 2):
37+
with open(files[i], 'r', encoding="utf-8") as file:
38+
text = file.read()
39+
40+
smart_scraper_graph = ScriptCreatorGraph(
41+
prompt=tasks[i],
42+
source=text,
43+
config=graph_config
44+
)
45+
46+
result = smart_scraper_graph.run()
47+
print(result)
48+
# ************************************************
49+
# Get graph execution info
50+
# ************************************************
51+
52+
graph_exec_info = smart_scraper_graph.get_execution_info()
53+
print(prettify_exec_info(graph_exec_info))
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper from text
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import ScriptCreatorGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
load_dotenv()
10+
11+
# ************************************************
12+
# Read the text file
13+
# ************************************************
14+
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
15+
tasks = ["List me all the projects with their description.",
16+
"List me all the articles with their description."]
17+
18+
# ************************************************
19+
# Define the configuration for the graph
20+
# ************************************************
21+
22+
openai_key = os.getenv("GPT4_KEY")
23+
24+
graph_config = {
25+
"llm": {
26+
"api_key": openai_key,
27+
"model": "gpt-4-turbo-preview",
28+
},
29+
"library": "beautifoulsoup"
30+
}
31+
32+
# ************************************************
33+
# Create the SmartScraperGraph instance and run it
34+
# ************************************************
35+
36+
for i in range(0, 2):
37+
with open(files[i], 'r', encoding="utf-8") as file:
38+
text = file.read()
39+
40+
smart_scraper_graph = ScriptCreatorGraph(
41+
prompt=tasks[i],
42+
source=text,
43+
config=graph_config
44+
)
45+
46+
result = smart_scraper_graph.run()
47+
print(result)
48+
# ************************************************
49+
# Get graph execution info
50+
# ************************************************
51+
52+
graph_exec_info = smart_scraper_graph.get_execution_info()
53+
print(prettify_exec_info(graph_exec_info))
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
OPENAI_APIKEY="your openai api key"
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
This folder contains all the scripts used for the benchmarks.
Remember to set the API keys if you use OpenAI, or to complete the local setup if you use Ollama/Docker.
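
A minimal sketch of the key loading the OpenAI scripts rely on: the scripts in this commit read GPT35_KEY and GPT4_KEY through python-dotenv, while the example env file defines OPENAI_APIKEY, so make sure the names in your .env match what the scripts expect.

```python
# Check that the keys the benchmark scripts expect are present.
# GPT35_KEY / GPT4_KEY are the names used by the scripts in this commit;
# adjust them if your .env uses a different name (e.g. OPENAI_APIKEY).
import os
from dotenv import load_dotenv

load_dotenv()  # loads variables from a local .env file

for name in ("GPT35_KEY", "GPT4_KEY"):
    print(f"{name} set:", os.getenv(name) is not None)
```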
