rollback

VinciGit00 · VinciGit00 · commit 6d33a8a25ef9 · 2024-05-23T18:44:04.000+02:00
diff --git a/examples/example.py b/examples/example.py
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
@@ -8,16 +8,21 @@
 from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings
 from langchain_google_genai import GoogleGenerativeAIEmbeddings
 from ..helpers import models_tokens
-from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic
+from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek
 from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
 
+from ..helpers import models_tokens
+from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek
+
+
 class AbstractGraph(ABC):
     """
     Scaffolding class for creating a graph representation and executing it.
 
         prompt (str): The prompt for the graph.
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
+        schema (str): The schema for the graph output.
         llm_model: An instance of a language model client, configured for generating answers.
         embedder_model: An instance of an embedding model client,
                         configured for generating embeddings.
@@ -28,6 +33,7 @@ class AbstractGraph(ABC):
         prompt (str): The prompt for the graph.
         config (dict): Configuration parameters for the graph.
         source (str, optional): The source of the graph.
+        schema (str, optional): The schema for the graph output.
 
     Example:
         >>> class MyGraph(AbstractGraph):
@@ -39,34 +45,42 @@ class AbstractGraph(ABC):
         >>> result = my_graph.run()
     """
 
-    def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
+    def __init__(self, prompt: str, config: dict, source: Optional[str] = None, schema: Optional[str] = None):
 
         self.prompt = prompt
         self.source = source
         self.config = config
+        self.schema = schema
         self.llm_model = self._create_llm(config["llm"], chat=True)
         self.embedder_model = self._create_default_embedder(llm_config=config["llm"]
                                                             ) if "embeddings" not in config else self._create_embedder(
             config["embeddings"])
+        self.verbose = False if config is None else config.get(
+            "verbose", False)
+        self.headless = True if config is None else config.get(
+            "headless", True)
+        self.loader_kwargs = config.get("loader_kwargs", {})
 
         # Create the graph
         self.graph = self._create_graph()
         self.final_state = None
         self.execution_info = None
 
         # Set common configuration parameters
-    
         self.verbose = False if config is None else config.get(
             "verbose", False)
         self.headless = True if config is None else config.get(
             "headless", True)
         self.loader_kwargs = config.get("loader_kwargs", {})
 
-        common_params = {"headless": self.headless,
-                     
-                         "loader_kwargs": self.loader_kwargs,
-                         "llm_model": self.llm_model,
-                         "embedder_model": self.embedder_model}
+        common_params = {
+            "headless": self.headless,
+            "verbose": self.verbose,
+            "loader_kwargs": self.loader_kwargs,
+            "llm_model": self.llm_model,
+            "embedder_model": self.embedder_model
+            }
+        
         self.set_common_params(common_params, overwrite=False)
 
     def set_common_params(self, params: dict, overwrite=False):
@@ -79,7 +93,7 @@ def set_common_params(self, params: dict, overwrite=False):
 
         for node in self.graph.nodes:
             node.update_config(params, overwrite)
-        
+
     def _set_model_token(self, llm):
 
         if 'Azure' in str(type(llm)):
@@ -157,7 +171,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
                 raise KeyError("Model not supported") from exc
             return Anthropic(llm_params)
         elif "ollama" in llm_params["model"]:
-            llm_params["model"] = llm_params["model"].split("/")[-1]
+            llm_params["model"] = llm_params["model"].split("ollama/")[-1]
 
             # allow user to set model_tokens in config
             try:
@@ -231,6 +245,8 @@ def _create_default_embedder(self, llm_config=None) -> object:
                                                 model="models/embedding-001")
         if isinstance(self.llm_model, OpenAI):
             return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
+        elif isinstance(self.llm_model, DeepSeek):
+            return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)   
         elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
             return self.llm_model
         elif isinstance(self.llm_model, AzureOpenAI):
@@ -271,7 +287,7 @@ def _create_embedder(self, embedder_config: dict) -> object:
         elif "azure" in embedder_config["model"]:
             return AzureOpenAIEmbeddings()
         elif "ollama" in embedder_config["model"]:
-            embedder_config["model"] = embedder_config["model"].split("/")[-1]
+            embedder_config["model"] = embedder_config["model"].split("ollama/")[-1]
             try:
                 models_tokens["ollama"][embedder_config["model"]]
             except KeyError as exc:
@@ -297,6 +313,10 @@ def _create_embedder(self, embedder_config: dict) -> object:
             except KeyError as exc:
                 raise KeyError("Model not supported") from exc
             return BedrockEmbeddings(client=client, model_id=embedder_config["model"])
+        else:
+            raise ValueError(
+                "Model provided by the configuration not supported")
+
     def get_state(self, key=None) -> dict:
         """""
         Get the final state of the graph.
@@ -334,4 +354,4 @@ def run(self) -> str:
         """
         Abstract method to execute the graph and return the result.
         """
-        pass
+        pass
diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py
@@ -11,7 +11,7 @@
     FetchNode,
     ParseNode,
     RAGNode,
-    GenerateAnswerPDFNode
+    GenerateAnswerNode
 )
 
 
@@ -48,7 +48,7 @@ class PDFScraperGraph(AbstractGraph):
     """
 
     def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
-        super().__init__(prompt, config, source)
+        super().__init__(prompt, config, source, schema)
 
         self.input_key = "pdf" if source.endswith("pdf") else "pdf_dir"
 
@@ -64,21 +64,41 @@ def _create_graph(self) -> BaseGraph:
             input='pdf | pdf_dir',
             output=["doc", "link_urls", "img_urls"],
         )
-        generate_answer_node_pdf = GenerateAnswerPDFNode(
+        parse_node = ParseNode(
+            input="doc",
+            output=["parsed_doc"],
+            node_config={
+                "chunk_size": self.model_token,
+            }
+        )
+        rag_node = RAGNode(
+            input="user_prompt & (parsed_doc | doc)",
+            output=["relevant_chunks"],
+            node_config={
+                "llm_model": self.llm_model,
+                "embedder_model": self.embedder_model,
+            }
+        )
+        generate_answer_node = GenerateAnswerNode(
             input="user_prompt & (relevant_chunks | parsed_doc | doc)",
             output=["answer"],
             node_config={
                 "llm_model": self.llm_model,
+                "schema": self.schema,
             }
         )
 
         return BaseGraph(
             nodes=[
                 fetch_node,
-                generate_answer_node_pdf,
+                parse_node,
+                rag_node,
+                generate_answer_node,
             ],
             edges=[
-                (fetch_node, generate_answer_node_pdf)
+                (fetch_node, parse_node),
+                (parse_node, rag_node),
+                (rag_node, generate_answer_node)
             ],
             entry_point=fetch_node
         )
@@ -94,4 +114,4 @@ def run(self) -> str:
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
-        return self.final_state.get("answer", "No answer found.")
+        return self.final_state.get("answer", "No answer found.")
diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py
@@ -2,17 +2,14 @@
 SmartScraperGraph Module
 """
 
-from typing import Optional
-
 from .base_graph import BaseGraph
-from .abstract_graph import AbstractGraph
-
 from ..nodes import (
     FetchNode,
     ParseNode,
     RAGNode,
     GenerateAnswerNode
 )
+from .abstract_graph import AbstractGraph
 
 
 class SmartScraperGraph(AbstractGraph):
@@ -25,7 +22,6 @@ class SmartScraperGraph(AbstractGraph):
         prompt (str): The prompt for the graph.
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
-        schema (str): The schema for the graph output.
         llm_model: An instance of a language model client, configured for generating answers.
         embedder_model: An instance of an embedding model client, 
         configured for generating embeddings.
@@ -36,7 +32,6 @@ class SmartScraperGraph(AbstractGraph):
         prompt (str): The prompt for the graph.
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
-        schema (str): The schema for the graph output.
 
     Example:
         >>> smart_scraper = SmartScraperGraph(
@@ -48,8 +43,8 @@ class SmartScraperGraph(AbstractGraph):
         )
     """
 
-    def __init__(self, prompt: str, source: str, config: dict, schema: Optional[str] = None):
-        super().__init__(prompt, config, source, schema)
+    def __init__(self, prompt: str, source: str, config: dict):
+        super().__init__(prompt, config, source)
 
         self.input_key = "url" if source.startswith("http") else "local_dir"
 
@@ -86,8 +81,7 @@ def _create_graph(self) -> BaseGraph:
             input="user_prompt & (relevant_chunks | parsed_doc | doc)",
             output=["answer"],
             node_config={
-                "llm_model": self.llm_model,
-                "schema": self.schema,
+                "llm_model": self.llm_model
             }
         )