Commit b313916

Merge pull request #74 from VinciGit00/llama3: add integration for llama3

2 parents b683854 + 8aa2cad

File tree

10 files changed: +224 additions, -11 deletions

examples/benchmarks/GenerateScraper/Readme.md

Lines changed: 7 additions & 9 deletions
The time is measured in seconds.

The models for this benchmark run on Ollama with nomic-embed-text.

| Hardware           | Model                                   | Example 1 | Example 2 |
| ------------------ | --------------------------------------- | --------- | --------- |
| Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 30.54s    | 35.76s    |
| Macbook m2 max     | Mistral on Ollama with nomic-embed-text |           |           |
| Macbook 14' m1 pro | Llama3 on Ollama with nomic-embed-text  | 27.82s    | 29.986s   |
| Macbook m2 max     | Llama3 on Ollama with nomic-embed-text  |           |           |

**Note**: the Docker examples were not run on devices other than the Macbook because performance is too slow (about 10x slower than Ollama).

# Performance on API services

### Example 1: personal portfolio

**URL**: https://perinim.github.io/projects
Lines changed: 62 additions & 0 deletions
```python
"""
Basic example of scraping pipeline using ScriptCreatorGraph from text
"""

import os
from dotenv import load_dotenv
from scrapegraphai.graphs import ScriptCreatorGraph
from scrapegraphai.utils import prettify_exec_info

load_dotenv()

# ************************************************
# Read the text files
# ************************************************
files = ["inputs/example_1.txt", "inputs/example_2.txt"]
tasks = ["List me all the projects with their description.",
         "List me all the articles with their description."]

# ************************************************
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("GPT4_KEY")  # loaded but unused in this local-model example

graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        # "model_tokens": 2000,  # set context length arbitrarily
        "base_url": "http://localhost:11434",  # set Ollama URL arbitrarily
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",  # set Ollama URL arbitrarily
    },
    "library": "beautifoulsoup"
}

# ************************************************
# Create the ScriptCreatorGraph instance and run it
# ************************************************

for i in range(0, 2):
    with open(files[i], 'r', encoding="utf-8") as file:
        text = file.read()

    script_creator_graph = ScriptCreatorGraph(
        prompt=tasks[i],
        source=text,
        config=graph_config
    )

    result = script_creator_graph.run()
    print(result)

    # ************************************************
    # Get graph execution info
    # ************************************************

    graph_exec_info = script_creator_graph.get_execution_info()
    print(prettify_exec_info(graph_exec_info))
```
Lines changed: 39 additions & 2 deletions
# Local models

The two websites benchmarked are:

- Example 1: https://perinim.github.io/projects
- Example 2: https://www.wired.com (as of 17/4/2024)

Both are stored locally as .txt files so the benchmark does not depend on an internet connection.

| Hardware           | Model                                   | Example 1 | Example 2 |
| ------------------ | --------------------------------------- | --------- | --------- |
| Macbook 14' m1 pro | Mistral on Ollama with nomic-embed-text | 11.60s    | 26.61s    |
| Macbook m2 max     | Mistral on Ollama with nomic-embed-text | 8.05s     | 12.17s    |
| Macbook 14' m1 pro | Llama3 on Ollama with nomic-embed-text  | 29.871s   | 35.32s    |
| Macbook m2 max     | Llama3 on Ollama with nomic-embed-text  |           |           |

**Note**: the Docker examples were not run on devices other than the Macbook because performance is too slow (about 10x slower than Ollama). The Docker results are:

| Hardware           | Example 1 | Example 2 |
| ------------------ | --------- | --------- |
| Macbook 14' m1 pro | 139.89s   | Too long  |

# Performance on API services

### Example 1: personal portfolio

**URL**: https://perinim.github.io/projects
**Task**: List me all the projects with their description.

| Name                | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
| gpt-3.5-turbo       | 25.22                    | 445          | 272           | 173               | 1                   | 0.000754       |
| gpt-4-turbo-preview | 9.53                     | 449          | 272           | 177               | 1                   | 0.00803        |

### Example 2: Wired

**URL**: https://www.wired.com
**Task**: List me all the articles with their description.

| Name                | Execution time (seconds) | total_tokens | prompt_tokens | completion_tokens | successful_requests | total_cost_USD |
| ------------------- | ------------------------ | ------------ | ------------- | ----------------- | ------------------- | -------------- |
| gpt-3.5-turbo       | 25.89                    | 445          | 272           | 173               | 1                   | 0.000754       |
| gpt-4-turbo-preview | 64.70                    | 3573         | 2199          | 1374              | 1                   | 0.06321        |
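The execution times in the tables above are wall-clock seconds. A minimal timing wrapper in the spirit of these benchmarks could look like this (a sketch only; the workload below is a stand-in, not one of the example scripts):

```python
import time

def timed(fn, *args, **kwargs):
    """Run fn and return (result, elapsed wall-clock seconds)."""
    start = time.perf_counter()
    result = fn(*args, **kwargs)
    return result, time.perf_counter() - start

# Stand-in workload instead of a real scraping-graph run:
result, elapsed = timed(sum, range(1_000_000))
print(f"sum={result}, time={elapsed:.2f}s")
```

`time.perf_counter()` is preferred over `time.time()` here because it is monotonic and has the highest available resolution for interval measurement.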
Lines changed: 54 additions & 0 deletions
```python
"""
Basic example of scraping pipeline using SmartScraperGraph from text
"""

import os
from scrapegraphai.graphs import SmartScraperGraph
from scrapegraphai.utils import prettify_exec_info

files = ["inputs/example_1.txt", "inputs/example_2.txt"]
tasks = ["List me all the projects with their description.",
         "List me all the articles with their description."]

# ************************************************
# Define the configuration for the graph
# ************************************************

graph_config = {
    "llm": {
        "model": "ollama/llama3",
        "temperature": 0,
        "format": "json",  # Ollama needs the format to be specified explicitly
        # "model_tokens": 2000,  # set context length arbitrarily
        "base_url": "http://localhost:11434",
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "temperature": 0,
        "base_url": "http://localhost:11434",
    }
}

# ************************************************
# Create the SmartScraperGraph instance and run it
# ************************************************

for i in range(0, 2):
    with open(files[i], 'r', encoding="utf-8") as file:
        text = file.read()

    smart_scraper_graph = SmartScraperGraph(
        prompt=tasks[i],
        source=text,
        config=graph_config
    )

    result = smart_scraper_graph.run()
    print(result)

    # ************************************************
    # Get graph execution info
    # ************************************************

    graph_exec_info = smart_scraper_graph.get_execution_info()
    print(prettify_exec_info(graph_exec_info))
```

scrapegraphai/helpers/models_tokens.py

Lines changed: 1 addition & 0 deletions
The `llama3` context length is added to the `ollama` token map (excerpt):

```python
"ollama": {
    "llama2": 4096,
    "llama3": 8192,  # added in this commit
    "mistral": 8192,
    "codellama": 16000,
    "dolphin-mixtral": 32000,
```
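A self-contained sketch of how such a provider-to-model map is typically consumed; the `get_context_length` helper and its fallback default are illustrative assumptions, not part of scrapegraphai's API:

```python
# Hypothetical standalone lookup over a map shaped like models_tokens.py
# (only the ollama entries from the excerpt above are reproduced here).
models_tokens = {
    "ollama": {
        "llama2": 4096,
        "llama3": 8192,  # entry added by this commit
        "mistral": 8192,
    },
}

def get_context_length(provider: str, model: str, default: int = 4096) -> int:
    """Return the context window for a model, falling back to a default.

    The fallback behavior is an assumption for this sketch.
    """
    return models_tokens.get(provider, {}).get(model, default)

print(get_context_length("ollama", "llama3"))  # 8192
```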

tests/Readme.md

Lines changed: 5 additions & 0 deletions
Regarding the tests for the graphs and nodes folders, a specific repo was created as an example ([link of the repo](https://github.com/VinciGit00/Scrapegrah-ai-website-for-tests)). The test website is hosted [here](https://scrapegrah-ai-website-for-tests.onrender.com).
Remember to activate Ollama and to have the LLM installed on your machine.

To run the tests, run:

```bash
pytest
```
Lines changed: 56 additions & 0 deletions
```python
"""
Module for the tests
"""
import os
import pytest
from scrapegraphai.graphs import SmartScraperGraph


@pytest.fixture
def sample_text():
    """
    Example of text
    """
    file_name = "inputs/plain_html_example.txt"
    curr_dir = os.path.dirname(os.path.realpath(__file__))
    file_path = os.path.join(curr_dir, file_name)

    with open(file_path, 'r', encoding="utf-8") as file:
        text = file.read()

    return text


@pytest.fixture
def graph_config():
    """
    Configuration of the graph
    """
    return {
        "llm": {
            "model": "ollama/llama3",
            "temperature": 0,
            "format": "json",
            "base_url": "http://localhost:11434",
        },
        "embeddings": {
            "model": "ollama/nomic-embed-text",
            "temperature": 0,
            "base_url": "http://localhost:11434",
        }
    }


def test_scraping_pipeline(sample_text: str, graph_config: dict):
    """
    Start of the scraping pipeline
    """
    smart_scraper_graph = SmartScraperGraph(
        prompt="List me all the news with their description.",
        source=sample_text,
        config=graph_config
    )

    result = smart_scraper_graph.run()

    assert result is not None
```
File renamed without changes.
