From 3b7b701a89aad503dea771db3f043167f7203d46 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 11 Oct 2024 08:32:41 +0200 Subject: [PATCH 1/3] feat: refactoring of mdscraper --- examples/openai/md_scraper_openai.py | 2 +- scrapegraphai/nodes/parse_node.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/openai/md_scraper_openai.py b/examples/openai/md_scraper_openai.py index 3a152243..118e7d59 100644 --- a/examples/openai/md_scraper_openai.py +++ b/examples/openai/md_scraper_openai.py @@ -37,7 +37,7 @@ # ************************************************ md_scraper_graph = DocumentScraperGraph( - prompt="List me all the authors, title and genres of the books", + prompt="List me all the projects", source=text, # Pass the content of the file, not the file object config=graph_config ) diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py index fd2f3810..7c80373d 100644 --- a/scrapegraphai/nodes/parse_node.py +++ b/scrapegraphai/nodes/parse_node.py @@ -85,10 +85,13 @@ def execute(self, state: dict) -> dict: else: docs_transformed = docs_transformed[0] - link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) + try: + link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source) + except Exception as e: + link_urls, img_urls = "", "" chunk_size = self.chunk_size - chunk_size = min(chunk_size - 500, int(chunk_size * 0.75)) + chunk_size = min(chunk_size - 500, int(chunk_size * 0.8)) if isinstance(docs_transformed, Document): chunks = split_text_into_chunks(text=docs_transformed.page_content, From 257f393761e8ff823e37c72659c8b55925c4aecb Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Fri, 11 Oct 2024 09:31:18 +0200 Subject: [PATCH 2/3] feat: async invocation --- scrapegraphai/builders/graph_builder.py | 2 +- .../nodes/generate_answer_csv_node.py | 4 +-- scrapegraphai/nodes/generate_answer_node.py | 27 ++----------------- .../nodes/generate_answer_node_k_level.py | 2 +- .../nodes/generate_answer_omni_node.py | 4 +-- .../nodes/generate_answer_pdf_node.py | 4 +-- scrapegraphai/nodes/generate_code_node.py | 4 +-- scrapegraphai/nodes/generate_scraper_node.py | 2 +- scrapegraphai/nodes/html_analyzer_node.py | 2 +- scrapegraphai/nodes/merge_answers_node.py | 2 +- .../nodes/merge_generated_scripts_node.py | 2 +- scrapegraphai/nodes/prompt_refiner_node.py | 2 +- scrapegraphai/nodes/reasoning_node.py | 2 +- scrapegraphai/nodes/robots_node.py | 2 +- scrapegraphai/nodes/search_link_node.py | 2 +- scrapegraphai/utils/code_error_analysis.py | 8 +++--- scrapegraphai/utils/code_error_correction.py | 8 +++--- 17 files changed, 28 insertions(+), 51 deletions(-) diff --git a/scrapegraphai/builders/graph_builder.py b/scrapegraphai/builders/graph_builder.py index feb52ee3..307ea0dd 100644 --- a/scrapegraphai/builders/graph_builder.py +++ b/scrapegraphai/builders/graph_builder.py @@ -120,7 +120,7 @@ def build_graph(self): Returns: dict: A JSON representation of the graph configuration. """ - return self.chain.invoke(self.prompt) + return self.chain.ainvoke(self.prompt) @staticmethod def convert_json_to_graphviz(json_data, format: str = 'pdf'): diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index 0419d891..ed58d4ba 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -126,7 +126,7 @@ def execute(self, state): ) chain = prompt | self.llm_model | output_parser - answer = chain.invoke({"question": user_prompt}) + answer = chain.ainvoke({"question": user_prompt}) state.update({self.output[0]: answer}) return state @@ -157,7 +157,7 @@ def execute(self, state): ) merge_chain = merge_prompt | self.llm_model | output_parser - answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) + answer = merge_chain.ainvoke({"context": batch_results, "question": user_prompt}) state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index d5034a1e..332f9c30 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,6 +1,3 @@ -""" -generate_answer_node module -""" from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser @@ -18,26 +15,6 @@ ) class GenerateAnswerNode(BaseNode): - """ - Initializes the GenerateAnswerNode class. - - Args: - input (str): The input data type for the node. - output (List[str]): The output data type(s) for the node. - node_config (Optional[dict]): Configuration dictionary for the node, - which includes the LLM model, verbosity, schema, and other settings. - Defaults to None. - node_name (str): The name of the node. Defaults to "GenerateAnswer". - - Attributes: - llm_model: The language model specified in the node configuration. - verbose (bool): Whether verbose mode is enabled. - force (bool): Whether to force certain behaviors, overriding defaults. - script_creator (bool): Whether the node is in script creation mode. - is_md_scraper (bool): Whether the node is scraping markdown data. - additional_info (Optional[str]): Any additional information to be - included in the prompt templates. - """ def __init__( self, input: str, @@ -113,7 +90,7 @@ def execute(self, state: dict) -> dict: chain = prompt | self.llm_model if output_parser: chain = chain | output_parser - answer = chain.invoke({"question": user_prompt}) + answer = chain.ainvoke({"question": user_prompt}) state.update({self.output[0]: answer}) return state @@ -144,7 +121,7 @@ def execute(self, state: dict) -> dict: merge_chain = merge_prompt | self.llm_model if output_parser: merge_chain = merge_chain | output_parser - answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) + answer = merge_chain.ainvoke({"context": batch_results, "question": user_prompt}) state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/generate_answer_node_k_level.py b/scrapegraphai/nodes/generate_answer_node_k_level.py index 291109f2..63fbbeaa 100644 --- a/scrapegraphai/nodes/generate_answer_node_k_level.py +++ b/scrapegraphai/nodes/generate_answer_node_k_level.py @@ -143,7 +143,7 @@ def execute(self, state: dict) -> dict: merge_chain = merge_prompt | self.llm_model if output_parser: merge_chain = merge_chain | output_parser - answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) + answer = merge_chain.ainvoke({"context": batch_results, "question": user_prompt}) state["answer"] = answer diff --git a/scrapegraphai/nodes/generate_answer_omni_node.py b/scrapegraphai/nodes/generate_answer_omni_node.py index 403240dd..824d25c8 100644 --- a/scrapegraphai/nodes/generate_answer_omni_node.py +++ b/scrapegraphai/nodes/generate_answer_omni_node.py @@ -121,7 +121,7 @@ def execute(self, state: dict) -> dict: ) chain = prompt | self.llm_model | output_parser - answer = chain.invoke({"question": user_prompt}) + answer = chain.ainvoke({"question": user_prompt}) state.update({self.output[0]: answer}) return state @@ -154,7 +154,7 @@ def execute(self, state: dict) -> dict: ) merge_chain = merge_prompt | self.llm_model | output_parser - answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) + answer = merge_chain.ainvoke({"context": batch_results, "question": user_prompt}) state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/generate_answer_pdf_node.py b/scrapegraphai/nodes/generate_answer_pdf_node.py index 544184b4..09bb7aff 100644 --- a/scrapegraphai/nodes/generate_answer_pdf_node.py +++ b/scrapegraphai/nodes/generate_answer_pdf_node.py @@ -128,7 +128,7 @@ def execute(self, state): }, ) chain = prompt | self.llm_model | output_parser - answer = chain.invoke({"question": user_prompt}) + answer = chain.ainvoke({"question": user_prompt}) state.update({self.output[0]: answer}) @@ -162,7 +162,7 @@ def execute(self, state): ) merge_chain = merge_prompt | self.llm_model | output_parser - answer = merge_chain.invoke({"context": batch_results, "question": user_prompt}) + answer = merge_chain.ainvoke({"context": batch_results, "question": user_prompt}) state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/generate_code_node.py b/scrapegraphai/nodes/generate_code_node.py index e5f98f70..d6f4ce7c 100644 --- a/scrapegraphai/nodes/generate_code_node.py +++ b/scrapegraphai/nodes/generate_code_node.py @@ -325,7 +325,7 @@ def generate_initial_code(self, state: dict) -> str: output_parser = StrOutputParser() chain = prompt | self.llm_model | output_parser - generated_code = chain.invoke({}) + generated_code = chain.ainvoke({}) return generated_code def semantic_comparison(self, generated_result: Any, reference_result: Any) -> Dict[str, Any]: @@ -368,7 +368,7 @@ def semantic_comparison(self, generated_result: Any, reference_result: Any) -> D ) chain = prompt | self.llm_model | output_parser - return chain.invoke({ + return chain.ainvoke({ "generated_result": json.dumps(generated_result, indent=2), "reference_result": json.dumps(reference_result_dict, indent=2) }) diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py index 93ad9cf3..458c7212 100644 --- a/scrapegraphai/nodes/generate_scraper_node.py +++ b/scrapegraphai/nodes/generate_scraper_node.py @@ -131,7 +131,7 @@ def execute(self, state: dict) -> dict: ) map_chain = prompt | self.llm_model | StrOutputParser() - answer = map_chain.invoke({"question": user_prompt}) + answer = map_chain.ainvoke({"question": user_prompt}) state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/html_analyzer_node.py b/scrapegraphai/nodes/html_analyzer_node.py index 26304dcd..d3aa5819 100644 --- a/scrapegraphai/nodes/html_analyzer_node.py +++ b/scrapegraphai/nodes/html_analyzer_node.py @@ -93,7 +93,7 @@ def execute(self, state: dict) -> dict: output_parser = StrOutputParser() chain = prompt | self.llm_model | output_parser - html_analysis = chain.invoke({}) + html_analysis = chain.ainvoke({}) state.update({self.output[0]: html_analysis, self.output[1]: reduced_html}) return state diff --git a/scrapegraphai/nodes/merge_answers_node.py b/scrapegraphai/nodes/merge_answers_node.py index 9f9a356c..8f2b9aff 100644 --- a/scrapegraphai/nodes/merge_answers_node.py +++ b/scrapegraphai/nodes/merge_answers_node.py @@ -95,7 +95,7 @@ def execute(self, state: dict) -> dict: ) merge_chain = prompt_template | self.llm_model | output_parser - answer = merge_chain.invoke({"user_prompt": user_prompt}) + answer = merge_chain.ainvoke({"user_prompt": user_prompt}) answer["sources"] = state.get("urls", []) state.update({self.output[0]: answer}) diff --git a/scrapegraphai/nodes/merge_generated_scripts_node.py b/scrapegraphai/nodes/merge_generated_scripts_node.py index fad7af70..9a3469f0 100644 --- a/scrapegraphai/nodes/merge_generated_scripts_node.py +++ b/scrapegraphai/nodes/merge_generated_scripts_node.py @@ -74,7 +74,7 @@ def execute(self, state: dict) -> dict: ) merge_chain = prompt_template | self.llm_model | StrOutputParser() - answer = merge_chain.invoke({"user_prompt": user_prompt}) + answer = merge_chain.ainvoke({"user_prompt": user_prompt}) state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/prompt_refiner_node.py b/scrapegraphai/nodes/prompt_refiner_node.py index 66c960ff..afb86ca3 100644 --- a/scrapegraphai/nodes/prompt_refiner_node.py +++ b/scrapegraphai/nodes/prompt_refiner_node.py @@ -96,7 +96,7 @@ def execute(self, state: dict) -> dict: output_parser = StrOutputParser() chain = prompt | self.llm_model | output_parser - refined_prompt = chain.invoke({}) + refined_prompt = chain.ainvoke({}) state.update({self.output[0]: refined_prompt}) return state diff --git a/scrapegraphai/nodes/reasoning_node.py b/scrapegraphai/nodes/reasoning_node.py index 6b91155c..4a9ea290 100644 --- a/scrapegraphai/nodes/reasoning_node.py +++ b/scrapegraphai/nodes/reasoning_node.py @@ -91,7 +91,7 @@ def execute(self, state: dict) -> dict: output_parser = StrOutputParser() chain = prompt | self.llm_model | output_parser - refined_prompt = chain.invoke({}) + refined_prompt = chain.ainvoke({}) state.update({self.output[0]: refined_prompt}) return state diff --git a/scrapegraphai/nodes/robots_node.py b/scrapegraphai/nodes/robots_node.py index 2bb47e74..7e7303bf 100644 --- a/scrapegraphai/nodes/robots_node.py +++ b/scrapegraphai/nodes/robots_node.py @@ -108,7 +108,7 @@ def execute(self, state: dict) -> dict: ) chain = prompt | self.llm_model | output_parser - is_scrapable = chain.invoke({"path": source})[0] + is_scrapable = chain.ainvoke({"path": source})[0] if "no" in is_scrapable: self.logger.warning( diff --git a/scrapegraphai/nodes/search_link_node.py b/scrapegraphai/nodes/search_link_node.py index 10907850..d3fea2dc 100644 --- a/scrapegraphai/nodes/search_link_node.py +++ b/scrapegraphai/nodes/search_link_node.py @@ -142,7 +142,7 @@ def execute(self, state: dict) -> dict: input_variables=["content", "user_prompt"], ) merge_chain = merge_prompt | self.llm_model | output_parser - answer = merge_chain.invoke( + answer = merge_chain.ainvoke( {"content": chunk.page_content} ) relevant_links += answer diff --git a/scrapegraphai/utils/code_error_analysis.py b/scrapegraphai/utils/code_error_analysis.py index 77c8efdf..ea33671b 100644 --- a/scrapegraphai/utils/code_error_analysis.py +++ b/scrapegraphai/utils/code_error_analysis.py @@ -31,7 +31,7 @@ def syntax_focused_analysis(state: dict, llm_model) -> str: prompt = PromptTemplate(template=TEMPLATE_SYNTAX_ANALYSIS, input_variables=["generated_code", "errors"]) chain = prompt | llm_model | StrOutputParser() - return chain.invoke({ + return chain.ainvoke({ "generated_code": state["generated_code"], "errors": state["errors"]["syntax"] }) @@ -51,7 +51,7 @@ def execution_focused_analysis(state: dict, llm_model) -> str: input_variables=["generated_code", "errors", "html_code", "html_analysis"]) chain = prompt | llm_model | StrOutputParser() - return chain.invoke({ + return chain.ainvoke({ "generated_code": state["generated_code"], "errors": state["errors"]["execution"], "html_code": state["html_code"], @@ -73,7 +73,7 @@ def validation_focused_analysis(state: dict, llm_model) -> str: input_variables=["generated_code", "errors", "json_schema", "execution_result"]) chain = prompt | llm_model | StrOutputParser() - return chain.invoke({ + return chain.ainvoke({ "generated_code": state["generated_code"], "errors": state["errors"]["validation"], "json_schema": state["json_schema"], @@ -97,7 +97,7 @@ def semantic_focused_analysis(state: dict, comparison_result: Dict[str, Any], ll input_variables=["generated_code", "differences", "explanation"]) chain = prompt | llm_model | StrOutputParser() - return chain.invoke({ + return chain.ainvoke({ "generated_code": state["generated_code"], "differences": json.dumps(comparison_result["differences"], indent=2), "explanation": comparison_result["explanation"] diff --git a/scrapegraphai/utils/code_error_correction.py b/scrapegraphai/utils/code_error_correction.py index 98908360..1d838f86 100644 --- a/scrapegraphai/utils/code_error_correction.py +++ b/scrapegraphai/utils/code_error_correction.py @@ -33,7 +33,7 @@ def syntax_focused_code_generation(state: dict, analysis: str, llm_model) -> str prompt = PromptTemplate(template=TEMPLATE_SYNTAX_CODE_GENERATION, input_variables=["analysis", "generated_code"]) chain = prompt | llm_model | StrOutputParser() - return chain.invoke({ + return chain.ainvoke({ "analysis": analysis, "generated_code": state["generated_code"] }) @@ -53,7 +53,7 @@ def execution_focused_code_generation(state: dict, analysis: str, llm_model) -> prompt = PromptTemplate(template=TEMPLATE_EXECUTION_CODE_GENERATION, input_variables=["analysis", "generated_code"]) chain = prompt | llm_model | StrOutputParser() - return chain.invoke({ + return chain.ainvoke({ "analysis": analysis, "generated_code": state["generated_code"] }) @@ -73,7 +73,7 @@ def validation_focused_code_generation(state: dict, analysis: str, llm_model) -> prompt = PromptTemplate(template=TEMPLATE_VALIDATION_CODE_GENERATION, input_variables=["analysis", "generated_code", "json_schema"]) chain = prompt | llm_model | StrOutputParser() - return chain.invoke({ + return chain.ainvoke({ "analysis": analysis, "generated_code": state["generated_code"], "json_schema": state["json_schema"] @@ -94,7 +94,7 @@ def semantic_focused_code_generation(state: dict, analysis: str, llm_model) -> s prompt = PromptTemplate(template=TEMPLATE_SEMANTIC_CODE_GENERATION, input_variables=["analysis", "generated_code", "generated_result", "reference_result"]) chain = prompt | llm_model | StrOutputParser() - return chain.invoke({ + return chain.ainvoke({ "analysis": analysis, "generated_code": state["generated_code"], "generated_result": json.dumps(state["execution_result"], indent=2), From 026a70bd3a01b0ebab4d175ae4005e7f3ba3a833 Mon Sep 17 00:00:00 2001 From: Marco Vinciguerra Date: Sat, 12 Oct 2024 09:57:59 +0200 Subject: [PATCH 3/3] fix: bugs --- pyproject.toml | 1 + .../nodes/generate_answer_csv_node.py | 4 +- scrapegraphai/nodes/generate_answer_node.py | 41 +++++++++++++++++-- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b5b22bb1..e8cf382e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ keywords = [ "web scraping tool", "webscraping", "graph", + "llm" ] classifiers = [ "Intended Audience :: Developers", diff --git a/scrapegraphai/nodes/generate_answer_csv_node.py b/scrapegraphai/nodes/generate_answer_csv_node.py index ed58d4ba..11ab15b9 100644 --- a/scrapegraphai/nodes/generate_answer_csv_node.py +++ b/scrapegraphai/nodes/generate_answer_csv_node.py @@ -60,7 +60,7 @@ def __init__( self.additional_info = node_config.get("additional_info") - def execute(self, state): + async def execute(self, state): """ Generates an answer by constructing a prompt from the user's input and the scraped content, querying the language model, and parsing its response. @@ -157,7 +157,7 @@ def execute(self, state): ) merge_chain = merge_prompt | self.llm_model | output_parser - answer = merge_chain.ainvoke({"context": batch_results, "question": user_prompt}) + answer = await merge_chain.ainvoke({"context": batch_results, "question": user_prompt}) state.update({self.output[0]: answer}) return state diff --git a/scrapegraphai/nodes/generate_answer_node.py b/scrapegraphai/nodes/generate_answer_node.py index 332f9c30..384d811d 100644 --- a/scrapegraphai/nodes/generate_answer_node.py +++ b/scrapegraphai/nodes/generate_answer_node.py @@ -1,3 +1,6 @@ +""" +GenerateAnswerNode Module +""" from typing import List, Optional from langchain.prompts import PromptTemplate from langchain_core.output_parsers import JsonOutputParser @@ -15,6 +18,26 @@ ) class GenerateAnswerNode(BaseNode): + """ + Initializes the GenerateAnswerNode class. + + Args: + input (str): The input data type for the node. + output (List[str]): The output data type(s) for the node. + node_config (Optional[dict]): Configuration dictionary for the node, + which includes the LLM model, verbosity, schema, and other settings. + Defaults to None. + node_name (str): The name of the node. Defaults to "GenerateAnswer". + + Attributes: + llm_model: The language model specified in the node configuration. + verbose (bool): Whether verbose mode is enabled. + force (bool): Whether to force certain behaviors, overriding defaults. + script_creator (bool): Whether the node is in script creation mode. + is_md_scraper (bool): Whether the node is scraping markdown data. + additional_info (Optional[str]): Any additional information to be + included in the prompt templates. + """ def __init__( self, input: str, @@ -34,7 +57,17 @@ def __init__( self.is_md_scraper = node_config.get("is_md_scraper", False) self.additional_info = node_config.get("additional_info") - def execute(self, state: dict) -> dict: + async def execute(self, state: dict) -> dict: + """ + Executes the GenerateAnswerNode. + + Args: + state (dict): The current state of the graph. The input keys will be used + to fetch the correct data from the state. + + Returns: + dict: The updated state with the output key containing the generated answer. + """ self.logger.info(f"--- Executing {self.node_name} Node ---") input_keys = self.get_input_keys(state) @@ -90,7 +123,7 @@ def execute(self, state: dict) -> dict: chain = prompt | self.llm_model if output_parser: chain = chain | output_parser - answer = chain.ainvoke({"question": user_prompt}) + answer = await chain.ainvoke({"question": user_prompt}) state.update({self.output[0]: answer}) return state @@ -110,7 +143,7 @@ def execute(self, state: dict) -> dict: chains_dict[chain_name] = chains_dict[chain_name] | output_parser async_runner = RunnableParallel(**chains_dict) - batch_results = async_runner.invoke({"question": user_prompt}) + batch_results = await async_runner.ainvoke({"question": user_prompt}) merge_prompt = PromptTemplate( template=template_merge_prompt, @@ -121,7 +154,7 @@ def execute(self, state: dict) -> dict: merge_chain = merge_prompt | self.llm_model if output_parser: merge_chain = merge_chain | output_parser - answer = merge_chain.ainvoke({"context": batch_results, "question": user_prompt}) + answer = await merge_chain.ainvoke({"context": batch_results, "question": user_prompt}) state.update({self.output[0]: answer}) return state