Merge branch 'pre/beta' into 133-support-claude3-haiku-and-others-using-litellm

PeriniM · web-flow · commit 0ab7272fd72b · 2024-05-05T02:15:15.000+02:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,79 @@
+## [0.9.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.9.0-beta.1...v0.9.0-beta.2) (2024-05-05)
+
+
+### Features
+
+* refactoring search function ([aeb1acb](https://github.com/VinciGit00/Scrapegraph-ai/commit/aeb1acbf05e63316c91672c99d88f8a6f338147f))
+
+
+### Bug Fixes
+
+* bug on .toml ([f7d66f5](https://github.com/VinciGit00/Scrapegraph-ai/commit/f7d66f51818dbdfddd0fa326f26265a3ab686b20))
+
+## [0.9.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.8.0...v0.9.0-beta.1) (2024-05-04)
+
+
+### Features
+
+* Enable end users to pass model instances of HuggingFaceHub ([7599234](https://github.com/VinciGit00/Scrapegraph-ai/commit/7599234ab9563ca4ee9b7f5b2d0267daac621ecf))
+
+
+### Build
+
+* **deps:** bump tqdm from 4.66.1 to 4.66.3 ([0a17c74](https://github.com/VinciGit00/Scrapegraph-ai/commit/0a17c74e50d0457aec289e81183e9c779c735842))
+* **deps:** bump tqdm from 4.66.1 to 4.66.3 ([aff6f98](https://github.com/VinciGit00/Scrapegraph-ai/commit/aff6f983b02a37ced21826847a6ace5fb15ecf3d))
+
+
+### CI
+
+* **release:** 0.8.0-beta.1 [skip ci] ([d277b34](https://github.com/VinciGit00/Scrapegraph-ai/commit/d277b349a98848749a7e38ea3c511271bced3b71))
+* **release:** 0.8.0-beta.2 [skip ci] ([892500a](https://github.com/VinciGit00/Scrapegraph-ai/commit/892500afe93c4d96dcffe897b382977a22079b83))
+
+## [0.8.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.7.0...v0.8.0) (2024-05-03)
+
+
+
+### Features
+
+* add pdf scraper ([10a9453](https://github.com/VinciGit00/Scrapegraph-ai/commit/10a94530e3fd4dfde933ecfa96cb3e21df72e606))
+
+
+### CI
+
+* **release:** 0.7.0-beta.3 [skip ci] ([fbb06ab](https://github.com/VinciGit00/Scrapegraph-ai/commit/fbb06ab551fac9cc9824ad567f042e55450277bd))
+
+## [0.7.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.2...v0.7.0) (2024-05-03)
+
+### Features
+
+* add base_node to __init__.py ([cb1cb61](https://github.com/VinciGit00/Scrapegraph-ai/commit/cb1cb616b7998d3624bf57b19b5f1b1945fea4ef))
+* Azure implementation + embeddings refactoring ([aa9271e](https://github.com/VinciGit00/Scrapegraph-ai/commit/aa9271e7bc4daa54860499d0615580b17550ff58))
+
+
+### Refactor
+
+* Changed the way embedding model is created in AbstractGraph class and removed handling of embedding model creation from RAGNode. Now AbstractGraph will call a dedicated method for embedding models instead of _create_llm. This makes it easy to use any LLM with any supported embedding model. ([819cbcd](https://github.com/VinciGit00/Scrapegraph-ai/commit/819cbcd3be1a8cb195de0b44c6b6d4d824e2a42a))
+
+
+### CI
+
+* **release:** 0.7.0-beta.1 [skip ci] ([98dec36](https://github.com/VinciGit00/Scrapegraph-ai/commit/98dec36c60d1dc8b072482e8d514c3869a45a3f8))
+* **release:** 0.7.0-beta.2 [skip ci] ([42fa02e](https://github.com/VinciGit00/Scrapegraph-ai/commit/42fa02e65a3a81796bd66e55cf9dd1d1b692cb89))
+
+
+## [0.7.0-beta.3](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.7.0-beta.2...v0.7.0-beta.3) (2024-05-03)
+## [0.7.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.7.0-beta.1...v0.7.0-beta.2) (2024-05-03)
+
+
+### Features
+
+* Azure implementation + embeddings refactoring ([aa9271e](https://github.com/VinciGit00/Scrapegraph-ai/commit/aa9271e7bc4daa54860499d0615580b17550ff58))
+* add pdf scraper ([10a9453](https://github.com/VinciGit00/Scrapegraph-ai/commit/10a94530e3fd4dfde933ecfa96cb3e21df72e606))
+
+### Refactor
+
+* Changed the way embedding model is created in AbstractGraph class and removed handling of embedding model creation from RAGNode. Now AbstractGraph will call a dedicated method for embedding models instead of _create_llm. This makes it easy to use any LLM with any supported embedding model. ([819cbcd](https://github.com/VinciGit00/Scrapegraph-ai/commit/819cbcd3be1a8cb195de0b44c6b6d4d824e2a42a))
+
 ## [0.7.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.2...v0.7.0-beta.1) (2024-05-03)
 
 
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
@@ -19,6 +19,12 @@ Install the library
 
    pip install scrapegraphai
 
+Additional dependencies on Windows when using WSL
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+   sudo apt-get -y install libnss3 libnspr4 libgbm1 libasound2
 
 As simple as that! You are now ready to scrape gnamgnamgnam 👿👿👿
 
diff --git a/examples/huggingfacehub/smart_scraper_huggingfacehub.py b/examples/huggingfacehub/smart_scraper_huggingfacehub.py
@@ -0,0 +1,63 @@
+""" 
+Basic example of scraping pipeline using SmartScraper with Hugging Face Hub model instances
+"""
+
+import os
+from dotenv import load_dotenv
+from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.utils import prettify_exec_info
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
+
+
+
+
+# Required environment variable in .env:
+# HUGGINGFACEHUB_API_TOKEN
+load_dotenv()
+
+HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
+# ************************************************
+# Initialize the model instances
+# ************************************************
+
+repo_id = "mistralai/Mistral-7B-Instruct-v0.2"
+
+llm_model_instance = HuggingFaceEndpoint(
+    repo_id=repo_id, max_length=128, temperature=0.5, token=HUGGINGFACEHUB_API_TOKEN
+)
+
+
+
+
+embedder_model_instance = HuggingFaceInferenceAPIEmbeddings(
+    api_key=HUGGINGFACEHUB_API_TOKEN, model_name="sentence-transformers/all-MiniLM-l6-v2"
+)
+
+# ************************************************
+# Create the SmartScraperGraph instance and run it
+# ************************************************
+
+graph_config = {
+    "llm": {"model_instance": llm_model_instance},
+    "embeddings": {"model_instance": embedder_model_instance}
+}
+
+smart_scraper_graph = SmartScraperGraph(
+    prompt="List me all the events, with the following fields: company_name, event_name, event_start_date, event_start_time, event_end_date, event_end_time, location, event_mode, event_category, third_party_redirect, no_of_days, time_in_hours, hosted_or_attending, refreshments_type,  registration_available, registration_link",
+    # also accepts a string with the already downloaded HTML code
+    source="https://www.hmhco.com/event",
+    config=graph_config
+)
+
+result = smart_scraper_graph.run()
+print(result)
+
+# ************************************************
+# Get graph execution info
+# ************************************************
+
+graph_exec_info = smart_scraper_graph.get_execution_info()
+print(prettify_exec_info(graph_exec_info))
+
+
diff --git a/examples/openai/smart_scraper_openai.py b/examples/openai/smart_scraper_openai.py
@@ -21,7 +21,7 @@
         "api_key": openai_key,
         "model": "gpt-3.5-turbo",
     },
-    "verbose":False,
+    "verbose": True,
 }
 
 # ************************************************
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "scrapegraphai"
 
-version = "0.7.0b1"
+version = "0.9.0b2"
 
 description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
 authors = [
@@ -33,7 +33,7 @@ beautifulsoup4 = "4.12.3"
 pandas = "2.0.3"
 python-dotenv = "1.0.1"
 tiktoken = {version = ">=0.5.2,<0.6.0"}
-tqdm = "4.66.1"
+tqdm = "4.66.3"
 graphviz = "0.20.1"
 google = "3.0.0"
 minify-html = "0.15.0"
@@ -42,6 +42,7 @@ langchain-groq = "0.1.3"
 playwright = "^1.43.0"
 langchain-aws = "^0.1.2"
 langchain-anthropic = "^0.1.11"
+yahoo-search-py="^0.3"
 
 [tool.poetry.dev-dependencies]
 pytest = "8.0.0"
diff --git a/requirements.txt b/requirements.txt
@@ -7,12 +7,13 @@ beautifulsoup4==4.12.3
 pandas==2.0.3
 python-dotenv==1.0.1
 tiktoken>=0.5.2,<0.6.0
-tqdm==4.66.1
+tqdm==4.66.3
 graphviz==0.20.1
 google==3.0.0
 minify-html==0.15.0
 free-proxy==1.1.1
 langchain-groq==0.1.3
 playwright==1.43.0
 langchain-aws==0.1.2
-langchain-anthropic==0.1.11 
+langchain-anthropic==0.1.11 
+yahoo-search-py==0.3
diff --git a/scrapegraphai/graphs/__init__.py b/scrapegraphai/graphs/__init__.py
@@ -10,3 +10,4 @@
 from .xml_scraper_graph import XMLScraperGraph
 from .json_scraper_graph import JSONScraperGraph
 from .csv_scraper_graph import CSVScraperGraph
+from .pdf_scraper_graph import PDFScraperGraph
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
@@ -67,8 +67,15 @@ def _set_model_token(self, llm):
         if 'Azure' in str(type(llm)):
             try:
                 self.model_token = models_tokens["azure"][llm.model_name]
-            except KeyError as exc:
-                raise KeyError("Model not supported") from exc
+            except KeyError:
+                raise KeyError("Model not supported")
+                
+        elif 'HuggingFaceEndpoint' in str(type(llm)):
+            if 'mistral' in llm.repo_id:
+                try:
+                    self.model_token = models_tokens['mistral'][llm.repo_id]
+                except KeyError:
+                    raise KeyError("Model not supported")
 
     def _create_llm(self, llm_config: dict, chat=False) -> object:
         """
@@ -185,7 +192,6 @@ def _create_default_embedder(self) -> object:
         Raises:
             ValueError: If the model is not supported.
         """
-
         if isinstance(self.llm_model, OpenAI):
             return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
         elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
@@ -221,6 +227,9 @@ def _create_embedder(self, embedder_config: dict) -> object:
             KeyError: If the model is not supported.
         """
 
+        if 'model_instance' in embedder_config:
+            return embedder_config['model_instance']
+        
         # Instantiate the embedding model based on the model name
         if "openai" in embedder_config["model"]:
             return OpenAIEmbeddings(api_key=embedder_config["api_key"])
diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py
@@ -0,0 +1,118 @@
+"""
+PDFScraperGraph Module
+"""
+
+from .base_graph import BaseGraph
+from ..nodes import (
+    FetchNode,
+    ParseNode,
+    RAGNode,
+    GenerateAnswerNode
+)
+from .abstract_graph import AbstractGraph
+
+
+class PDFScraperGraph(AbstractGraph):
+    """
+    PDFScraperGraph is a scraping pipeline that extracts information from pdf files using a natural
+    language model to interpret and answer prompts.
+
+    Attributes:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+        llm_model: An instance of a language model client, configured for generating answers.
+        embedder_model: An instance of an embedding model client,
+            configured for generating embeddings.
+        verbose (bool): A flag indicating whether to show print statements during execution.
+        headless (bool): A flag indicating whether to run the graph in headless mode.
+        model_token (int): The token limit for the language model.
+
+    Args:
+        prompt (str): The prompt for the graph.
+        source (str): The source of the graph.
+        config (dict): Configuration parameters for the graph.
+
+    Example:
+        >>> pdf_scraper = PDFScraperGraph(
+        ...     "List me all the attractions in Chioggia.",
+        ...     "data/chioggia.pdf",
+        ...     {"llm": {"model": "gpt-3.5-turbo"}}
+        ... )
+        >>> result = pdf_scraper.run()
+    """
+
+    def __init__(self, prompt: str, source: str, config: dict):
+        super().__init__(prompt, config, source)
+
+        self.input_key = "pdf" if source.endswith("pdf") else "pdf_dir"
+
+    def _create_graph(self) -> BaseGraph:
+        """
+        Creates the graph of nodes representing the workflow for PDF scraping.
+
+        Returns:
+            BaseGraph: A graph instance representing the PDF scraping workflow.
+        """
+
+        fetch_node = FetchNode(
+            input="pdf_dir",
+            output=["doc"],
+            node_config={
+                "headless": self.headless,
+                "verbose": self.verbose
+            }
+        )
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "chunk_size": self.model_token,
+                "verbose": self.verbose
+            }
+        )
+        rag_node = RAGNode(
+            input="user_prompt & (parsed_doc | doc)",
+            output=["relevant_chunks"],
+            node_config={
+                "llm": self.llm_model,
+                "embedder_model": self.embedder_model,
+                "verbose": self.verbose
+            }
+        )
+        generate_answer_node = GenerateAnswerNode(
+            input="user_prompt & (relevant_chunks | parsed_doc | doc)",
+            output=["answer"],
+            node_config={
+                "llm": self.llm_model,
+                "verbose": self.verbose
+            }
+        )
+
+        return BaseGraph(
+            nodes=[
+                fetch_node,
+                parse_node,
+                rag_node,
+                generate_answer_node,
+            ],
+            edges=[
+                (fetch_node, parse_node),
+                (parse_node, rag_node),
+                (rag_node, generate_answer_node)
+            ],
+            entry_point=fetch_node
+        )
+
+    def run(self) -> str:
+        """
+        Executes the PDF scraping process and returns the answer to the prompt.
+
+        Returns:
+            str: The answer to the prompt.
+        """
+
+        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
+        self.final_state, self.execution_info = self.graph.execute(inputs)
+
+        return self.final_state.get("answer", "No answer found.")
diff --git a/scrapegraphai/helpers/models_tokens.py b/scrapegraphai/helpers/models_tokens.py
@@ -35,7 +35,8 @@
         "codellama": 16000,
         "dolphin-mixtral": 32000,
         "mistral-openorca": 32000,
-        "stablelm-zephyr": 8192
+        "stablelm-zephyr": 8192,
+        "nomic-embed-text":8192
     },
     "groq": {
         "llama3-8b-8192": 8192,
@@ -65,5 +66,8 @@
         "mistral.mistral-large-2402-v1:0": 32768,
         "cohere.embed-english-v3": 512,
         "cohere.embed-multilingual-v3": 512
+    },
+    "mistral": {
+        "mistralai/Mistral-7B-Instruct-v0.2": 32000
     }
 }
diff --git a/scrapegraphai/nodes/__init__.py b/scrapegraphai/nodes/__init__.py
@@ -16,3 +16,4 @@
 from .search_link_node import SearchLinkNode
 from .robots_node import RobotsNode
 from .generate_answer_csv_node import GenerateAnswerCSVNode
+from .generate_answer_pdf_node import GenerateAnswerPDFNode
diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py
diff --git a/scrapegraphai/nodes/rag_node.py b/scrapegraphai/nodes/rag_node.py
diff --git a/scrapegraphai/utils/research_web.py b/scrapegraphai/utils/research_web.py

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@`
`21`	`21`	`"api_key": openai_key,`
`22`	`22`	`"model": "gpt-3.5-turbo",`
`23`	`23`	`},`
`24`		`- "verbose":False,`
	`24`	`+ "verbose": True,`
`25`	`25`	`}`
`26`	`26`
`27`	`27`	`# ************************************************`