2 changes: 1 addition & 1 deletion examples/openai/md_scraper_openai.py
@@ -37,7 +37,7 @@
 # ************************************************

 md_scraper_graph = DocumentScraperGraph(
-    prompt="List me all the authors, title and genres of the books",
+    prompt="List me all the projects",
     source=text,  # Pass the content of the file, not the file object
     config=graph_config
 )

1 change: 1 addition & 0 deletions pyproject.toml
@@ -60,6 +60,7 @@ keywords = [
     "web scraping tool",
     "webscraping",
     "graph",
+    "llm"
 ]
 classifiers = [
     "Intended Audience :: Developers",

2 changes: 1 addition & 1 deletion scrapegraphai/builders/graph_builder.py
@@ -120,7 +120,7 @@ def build_graph(self):
         Returns:
             dict: A JSON representation of the graph configuration.
         """
-        return self.chain.invoke(self.prompt)
+        return self.chain.ainvoke(self.prompt)

     @staticmethod
     def convert_json_to_graphviz(json_data, format: str = 'pdf'):

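Worth noting for readers following along: `build_graph` stays a plain `def` here, so after this hunk it returns the coroutine produced by `ainvoke` rather than the JSON dict itself. A minimal, self-contained sketch of how a caller would drive that result (the `FakeChain` stand-in and the prompt text are illustrative assumptions, not the library's API):

import asyncio

class FakeChain:
    """Stand-in for the LangChain runnable held by GraphBuilder (assumption)."""
    async def ainvoke(self, prompt: str) -> dict:
        return {"nodes": [], "edges": [], "prompt": prompt}

def build_graph(chain: FakeChain, prompt: str):
    # Mirrors the hunk above: a plain `def` returning ainvoke's coroutine.
    return chain.ainvoke(prompt)

async def main() -> None:
    # Awaiting the returned coroutine yields the dict; calling build_graph
    # without an event loop only hands back an un-run coroutine object.
    graph_json = await build_graph(FakeChain(), "scrape the project list")
    print(graph_json)

asyncio.run(main())
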
6 changes: 3 additions & 3 deletions scrapegraphai/nodes/generate_answer_csv_node.py
@@ -60,7 +60,7 @@ def __init__(

         self.additional_info = node_config.get("additional_info")

-    def execute(self, state):
+    async def execute(self, state):
         """
         Generates an answer by constructing a prompt from the user's input and the scraped
         content, querying the language model, and parsing its response.
@@ -126,7 +126,7 @@ def execute(self, state):
         )

         chain = prompt | self.llm_model | output_parser
-        answer = chain.invoke({"question": user_prompt})
+        answer = chain.ainvoke({"question": user_prompt})
         state.update({self.output[0]: answer})
         return state

@@ -157,7 +157,7 @@ def execute(self, state):
         )

         merge_chain = merge_prompt | self.llm_model | output_parser
-        answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
+        answer = await merge_chain.ainvoke({"context": batch_results, "question": user_prompt})

         state.update({self.output[0]: answer})
         return state

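The two hunks above differ in one detail: the merge path awaits `ainvoke`, while the single-chunk path assigns the bare call. Inside an `async def execute`, only the awaited form binds the parsed answer; the unawaited form binds a coroutine object to the state key. A small self-contained sketch of the awaited pattern (the `EchoChain` class and state keys are stand-ins, not ScrapeGraphAI's real classes):

import asyncio

class EchoChain:
    """Stand-in for `prompt | llm_model | output_parser` (assumption)."""
    async def ainvoke(self, inputs: dict) -> str:
        return f"answer to: {inputs['question']}"

async def execute(state: dict) -> dict:
    chain = EchoChain()
    # `await` resolves the coroutine to the parsed string; without it,
    # state["answer"] would hold `<coroutine ...>` instead of text.
    answer = await chain.ainvoke({"question": state["user_prompt"]})
    state.update({"answer": answer})
    return state

state = asyncio.run(execute({"user_prompt": "List me all the projects"}))
print(state["answer"])
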
56 changes: 33 additions & 23 deletions scrapegraphai/nodes/generate_answer_node.py
@@ -1,5 +1,5 @@
 """
-generate_answer_node module
+GenerateAnswerNode Module
 """
 from typing import List, Optional
 from langchain.prompts import PromptTemplate
@@ -19,24 +19,24 @@

 class GenerateAnswerNode(BaseNode):
     """
-Initializes the GenerateAnswerNode class.
-
-Args:
-    input (str): The input data type for the node.
-    output (List[str]): The output data type(s) for the node.
-    node_config (Optional[dict]): Configuration dictionary for the node,
-        which includes the LLM model, verbosity, schema, and other settings.
-        Defaults to None.
-    node_name (str): The name of the node. Defaults to "GenerateAnswer".
-
-Attributes:
-    llm_model: The language model specified in the node configuration.
-    verbose (bool): Whether verbose mode is enabled.
-    force (bool): Whether to force certain behaviors, overriding defaults.
-    script_creator (bool): Whether the node is in script creation mode.
-    is_md_scraper (bool): Whether the node is scraping markdown data.
-    additional_info (Optional[str]): Any additional information to be
-        included in the prompt templates.
+    Initializes the GenerateAnswerNode class.
+
+    Args:
+        input (str): The input data type for the node.
+        output (List[str]): The output data type(s) for the node.
+        node_config (Optional[dict]): Configuration dictionary for the node,
+            which includes the LLM model, verbosity, schema, and other settings.
+            Defaults to None.
+        node_name (str): The name of the node. Defaults to "GenerateAnswer".
+
+    Attributes:
+        llm_model: The language model specified in the node configuration.
+        verbose (bool): Whether verbose mode is enabled.
+        force (bool): Whether to force certain behaviors, overriding defaults.
+        script_creator (bool): Whether the node is in script creation mode.
+        is_md_scraper (bool): Whether the node is scraping markdown data.
+        additional_info (Optional[str]): Any additional information to be
+            included in the prompt templates.
     """
     def __init__(
         self,
@@ -57,7 +57,17 @@ def __init__(
         self.is_md_scraper = node_config.get("is_md_scraper", False)
         self.additional_info = node_config.get("additional_info")

-    def execute(self, state: dict) -> dict:
+    async def execute(self, state: dict) -> dict:
+        """
+        Executes the GenerateAnswerNode.
+
+        Args:
+            state (dict): The current state of the graph. The input keys will be used
+                to fetch the correct data from the state.
+
+        Returns:
+            dict: The updated state with the output key containing the generated answer.
+        """
         self.logger.info(f"--- Executing {self.node_name} Node ---")

         input_keys = self.get_input_keys(state)
@@ -113,7 +123,7 @@ def execute(self, state: dict) -> dict:
         chain = prompt | self.llm_model
         if output_parser:
             chain = chain | output_parser
-        answer = chain.invoke({"question": user_prompt})
+        answer = await chain.ainvoke({"question": user_prompt})

         state.update({self.output[0]: answer})
         return state
@@ -133,7 +143,7 @@ def execute(self, state: dict) -> dict:
             chains_dict[chain_name] = chains_dict[chain_name] | output_parser

         async_runner = RunnableParallel(**chains_dict)
-        batch_results = async_runner.invoke({"question": user_prompt})
+        batch_results = await async_runner.ainvoke({"question": user_prompt})

         merge_prompt = PromptTemplate(
             template=template_merge_prompt,
@@ -144,7 +154,7 @@ def execute(self, state: dict) -> dict:
         merge_chain = merge_prompt | self.llm_model
         if output_parser:
             merge_chain = merge_chain | output_parser
-        answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
+        answer = await merge_chain.ainvoke({"context": batch_results, "question": user_prompt})

         state.update({self.output[0]: answer})
         return state

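These hunks cover the node's map-reduce path: one chain per document chunk run through `RunnableParallel`, then a merge chain over the batch results, all now awaited. A self-contained sketch of that shape (the `StubChain` class and chunk names are assumptions; `asyncio.gather` stands in for RunnableParallel's scheduling):

import asyncio

class StubChain:
    """Stand-in for `prompt | llm_model | output_parser` (assumption)."""
    def __init__(self, tag: str) -> None:
        self.tag = tag

    async def ainvoke(self, inputs: dict) -> str:
        return f"{self.tag} -> {inputs['question']}"

async def map_reduce_answer(question: str, n_chunks: int = 3) -> str:
    # Fan out: one chain per chunk, run concurrently (the role played by
    # RunnableParallel(**chains_dict) in the hunk above).
    chains = {f"chunk{i}": StubChain(f"chunk{i}") for i in range(n_chunks)}
    results = await asyncio.gather(
        *(chain.ainvoke({"question": question}) for chain in chains.values())
    )
    batch_results = dict(zip(chains, results))
    # Reduce: the merge chain sees all chunk answers as context.
    merge = StubChain("merge")
    return await merge.ainvoke({"question": f"{question} | {batch_results}"})

print(asyncio.run(map_reduce_answer("List me all the projects")))
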
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/generate_answer_node_k_level.py
@@ -143,7 +143,7 @@ def execute(self, state: dict) -> dict:
         merge_chain = merge_prompt | self.llm_model
         if output_parser:
             merge_chain = merge_chain | output_parser
-        answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
+        answer = merge_chain.ainvoke({"context": batch_results, "question": user_prompt})

         state["answer"] = answer

4 changes: 2 additions & 2 deletions scrapegraphai/nodes/generate_answer_omni_node.py
@@ -121,7 +121,7 @@ def execute(self, state: dict) -> dict:
         )

         chain = prompt | self.llm_model | output_parser
-        answer = chain.invoke({"question": user_prompt})
+        answer = chain.ainvoke({"question": user_prompt})

         state.update({self.output[0]: answer})
         return state
@@ -154,7 +154,7 @@ def execute(self, state: dict) -> dict:
         )

         merge_chain = merge_prompt | self.llm_model | output_parser
-        answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
+        answer = merge_chain.ainvoke({"context": batch_results, "question": user_prompt})

         state.update({self.output[0]: answer})
         return state

4 changes: 2 additions & 2 deletions scrapegraphai/nodes/generate_answer_pdf_node.py
@@ -128,7 +128,7 @@ def execute(self, state):
             },
         )
         chain = prompt | self.llm_model | output_parser
-        answer = chain.invoke({"question": user_prompt})
+        answer = chain.ainvoke({"question": user_prompt})


         state.update({self.output[0]: answer})
@@ -162,7 +162,7 @@ def execute(self, state):
         )

         merge_chain = merge_prompt | self.llm_model | output_parser
-        answer = merge_chain.invoke({"context": batch_results, "question": user_prompt})
+        answer = merge_chain.ainvoke({"context": batch_results, "question": user_prompt})

         state.update({self.output[0]: answer})
         return state

4 changes: 2 additions & 2 deletions scrapegraphai/nodes/generate_code_node.py
@@ -325,7 +325,7 @@ def generate_initial_code(self, state: dict) -> str:
         output_parser = StrOutputParser()

         chain = prompt | self.llm_model | output_parser
-        generated_code = chain.invoke({})
+        generated_code = chain.ainvoke({})
         return generated_code

     def semantic_comparison(self, generated_result: Any, reference_result: Any) -> Dict[str, Any]:
@@ -368,7 +368,7 @@ def semantic_comparison(self, generated_result: Any, reference_result: Any) -> Dict[str, Any]:
         )

         chain = prompt | self.llm_model | output_parser
-        return chain.invoke({
+        return chain.ainvoke({
             "generated_result": json.dumps(generated_result, indent=2),
             "reference_result": json.dumps(reference_result_dict, indent=2)
         })

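A typing note on these hunks: once a plain `def` returns the coroutine from `ainvoke`, annotations like `-> str` or `-> Dict[str, Any]` describe the awaited value rather than what the function actually returns. A short sketch of an annotation matching the new behavior (names here are illustrative, not the node's real API):

from typing import Any, Awaitable, Dict

async def _ainvoke(payload: Dict[str, Any]) -> Dict[str, Any]:
    return {"echo": payload}

def semantic_comparison(payload: Dict[str, Any]) -> Awaitable[Dict[str, Any]]:
    # Returning the coroutine un-awaited means the caller receives an
    # awaitable, so the annotation names Awaitable[...], not Dict[...].
    return _ainvoke(payload)
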
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/generate_scraper_node.py
@@ -131,7 +131,7 @@ def execute(self, state: dict) -> dict:
         )
         map_chain = prompt | self.llm_model | StrOutputParser()

-        answer = map_chain.invoke({"question": user_prompt})
+        answer = map_chain.ainvoke({"question": user_prompt})

         state.update({self.output[0]: answer})
         return state

2 changes: 1 addition & 1 deletion scrapegraphai/nodes/html_analyzer_node.py
@@ -93,7 +93,7 @@ def execute(self, state: dict) -> dict:
         output_parser = StrOutputParser()

         chain = prompt | self.llm_model | output_parser
-        html_analysis = chain.invoke({})
+        html_analysis = chain.ainvoke({})

         state.update({self.output[0]: html_analysis, self.output[1]: reduced_html})
         return state

2 changes: 1 addition & 1 deletion scrapegraphai/nodes/merge_answers_node.py
@@ -95,7 +95,7 @@ def execute(self, state: dict) -> dict:
         )

         merge_chain = prompt_template | self.llm_model | output_parser
-        answer = merge_chain.invoke({"user_prompt": user_prompt})
+        answer = merge_chain.ainvoke({"user_prompt": user_prompt})
         answer["sources"] = state.get("urls", [])

         state.update({self.output[0]: answer})

2 changes: 1 addition & 1 deletion scrapegraphai/nodes/merge_generated_scripts_node.py
@@ -74,7 +74,7 @@ def execute(self, state: dict) -> dict:
         )

         merge_chain = prompt_template | self.llm_model | StrOutputParser()
-        answer = merge_chain.invoke({"user_prompt": user_prompt})
+        answer = merge_chain.ainvoke({"user_prompt": user_prompt})

         state.update({self.output[0]: answer})
         return state

7 changes: 5 additions & 2 deletions scrapegraphai/nodes/parse_node.py
@@ -85,10 +85,13 @@ def execute(self, state: dict) -> dict:
         else:
             docs_transformed = docs_transformed[0]

-        link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
+        try:
+            link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
+        except Exception as e:
+            link_urls, img_urls = "", ""

         chunk_size = self.chunk_size
-        chunk_size = min(chunk_size - 500, int(chunk_size * 0.75))
+        chunk_size = min(chunk_size - 500, int(chunk_size * 0.8))

         if isinstance(docs_transformed, Document):
             chunks = split_text_into_chunks(text=docs_transformed.page_content,

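The second change in this hunk loosens the effective chunk size: the cap moves from 75% to 80% of the configured window, while the flat 500-token margin stays. A quick sketch of the arithmetic (the function name is ours; the formula is straight from the hunk):

def effective_chunk_size(chunk_size: int) -> int:
    # Formula from the hunk: keep whichever safety margin is tighter,
    # a flat 500 tokens or 20% of the window.
    return min(chunk_size - 500, int(chunk_size * 0.8))

# Above 2500 tokens the 0.8 factor is the binding constraint;
# below 2500 the flat 500-token margin wins.
for size in (2048, 2500, 4096, 8192):
    print(size, "->", effective_chunk_size(size))
# 2048 -> 1548, 2500 -> 2000, 4096 -> 3276, 8192 -> 6553
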
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/prompt_refiner_node.py
@@ -96,7 +96,7 @@ def execute(self, state: dict) -> dict:
         output_parser = StrOutputParser()

         chain = prompt | self.llm_model | output_parser
-        refined_prompt = chain.invoke({})
+        refined_prompt = chain.ainvoke({})

         state.update({self.output[0]: refined_prompt})
         return state

2 changes: 1 addition & 1 deletion scrapegraphai/nodes/reasoning_node.py
@@ -91,7 +91,7 @@ def execute(self, state: dict) -> dict:
         output_parser = StrOutputParser()

         chain = prompt | self.llm_model | output_parser
-        refined_prompt = chain.invoke({})
+        refined_prompt = chain.ainvoke({})

         state.update({self.output[0]: refined_prompt})
         return state

2 changes: 1 addition & 1 deletion scrapegraphai/nodes/robots_node.py
@@ -108,7 +108,7 @@ def execute(self, state: dict) -> dict:
         )

         chain = prompt | self.llm_model | output_parser
-        is_scrapable = chain.invoke({"path": source})[0]
+        is_scrapable = chain.ainvoke({"path": source})[0]

         if "no" in is_scrapable:
             self.logger.warning(

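One detail specific to this hunk: the result is subscripted immediately. That only works once the coroutine has been resolved, so the subscript has to follow an await. A self-contained sketch of the awaited form (the `ListChain` stand-in and its yes/no output are assumptions):

import asyncio

class ListChain:
    """Stand-in for a chain whose parsed output is a list (assumption)."""
    async def ainvoke(self, inputs: dict) -> list:
        return ["no" if "disallow" in inputs["path"] else "yes"]

async def check_scrapable(path: str) -> str:
    # Subscript after awaiting: a bare chain.ainvoke(...)[0] raises
    # TypeError because coroutine objects are not subscriptable.
    is_scrapable = (await ListChain().ainvoke({"path": path}))[0]
    return is_scrapable

print(asyncio.run(check_scrapable("https://example.com/disallowed")))
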
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/search_link_node.py
@@ -142,7 +142,7 @@ def execute(self, state: dict) -> dict:
             input_variables=["content", "user_prompt"],
         )
         merge_chain = merge_prompt | self.llm_model | output_parser
-        answer = merge_chain.invoke(
+        answer = merge_chain.ainvoke(
             {"content": chunk.page_content}
         )
         relevant_links += answer

8 changes: 4 additions & 4 deletions scrapegraphai/utils/code_error_analysis.py
@@ -31,7 +31,7 @@ def syntax_focused_analysis(state: dict, llm_model) -> str:
     prompt = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS,
                             input_variables=["generated_code", "errors"])
     chain = prompt | llm_model | StrOutputParser()
-    return chain.invoke({
+    return chain.ainvoke({
         "generated_code": state["generated_code"],
         "errors": state["errors"]["syntax"]
     })
@@ -51,7 +51,7 @@ def execution_focused_analysis(state: dict, llm_model) -> str:
                             input_variables=["generated_code", "errors",
                                              "html_code", "html_analysis"])
     chain = prompt | llm_model | StrOutputParser()
-    return chain.invoke({
+    return chain.ainvoke({
         "generated_code": state["generated_code"],
         "errors": state["errors"]["execution"],
         "html_code": state["html_code"],
@@ -73,7 +73,7 @@ def validation_focused_analysis(state: dict, llm_model) -> str:
                             input_variables=["generated_code", "errors",
                                              "json_schema", "execution_result"])
     chain = prompt | llm_model | StrOutputParser()
-    return chain.invoke({
+    return chain.ainvoke({
         "generated_code": state["generated_code"],
         "errors": state["errors"]["validation"],
         "json_schema": state["json_schema"],
@@ -97,7 +97,7 @@ def semantic_focused_analysis(state: dict, comparison_result: Dict[str, Any], llm_model) -> str:
                             input_variables=["generated_code",
                                              "differences", "explanation"])
     chain = prompt | llm_model | StrOutputParser()
-    return chain.invoke({
+    return chain.ainvoke({
         "generated_code": state["generated_code"],
         "differences": json.dumps(comparison_result["differences"], indent=2),
         "explanation": comparison_result["explanation"]

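Since each helper now hands back a coroutine, independent analysis passes can be overlapped rather than run back to back. A self-contained illustration of that payoff (the `fake_analysis` stub stands in for the real helpers, which would also need an `llm_model`):

import asyncio

async def fake_analysis(kind: str, state: dict) -> str:
    """Stand-in for the analysis helpers above (assumption)."""
    await asyncio.sleep(0.1)  # simulates one LLM round trip
    return f"{kind} analysis of {state['generated_code']!r}"

async def analyze_all(state: dict) -> dict:
    kinds = ("syntax", "execution", "validation")
    # Three passes overlap in ~0.1s total instead of ~0.3s sequentially.
    results = await asyncio.gather(*(fake_analysis(k, state) for k in kinds))
    return dict(zip(kinds, results))

print(asyncio.run(analyze_all({"generated_code": "print('hi')"})))
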
8 changes: 4 additions & 4 deletions scrapegraphai/utils/code_error_correction.py
@@ -33,7 +33,7 @@ def syntax_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
     prompt = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION,
                             input_variables=["analysis", "generated_code"])
     chain = prompt | llm_model | StrOutputParser()
-    return chain.invoke({
+    return chain.ainvoke({
         "analysis": analysis,
         "generated_code": state["generated_code"]
     })
@@ -53,7 +53,7 @@ def execution_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
     prompt = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION,
                             input_variables=["analysis", "generated_code"])
     chain = prompt | llm_model | StrOutputParser()
-    return chain.invoke({
+    return chain.ainvoke({
         "analysis": analysis,
         "generated_code": state["generated_code"]
     })
@@ -73,7 +73,7 @@ def validation_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
     prompt = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION,
                             input_variables=["analysis", "generated_code", "json_schema"])
     chain = prompt | llm_model | StrOutputParser()
-    return chain.invoke({
+    return chain.ainvoke({
         "analysis": analysis,
         "generated_code": state["generated_code"],
         "json_schema": state["json_schema"]
@@ -94,7 +94,7 @@ def semantic_focused_code_generation(state: dict, analysis: str, llm_model) -> str:
     prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION,
                             input_variables=["analysis", "generated_code", "generated_result", "reference_result"])
     chain = prompt | llm_model | StrOutputParser()
-    return chain.invoke({
+    return chain.ainvoke({
         "analysis": analysis,
         "generated_code": state["generated_code"],
         "generated_result": json.dumps(state["execution_result"], indent=2),