Merge pull request #9 from VinciGit00/pre/beta

vedovati-matteo · web-flow · commit 04fdb5d90005 · 2024-09-25T10:07:56.000+02:00
add possibility to save the code
diff --git a/examples/code_generation/simple_with_schema.py b/examples/code_generation/simple_with_schema.py
@@ -42,6 +42,7 @@ class Projects(BaseModel):
         "validation": 3,
         "semantic": 3
     },
+    "output_file_name": "extracted_data.py"
 }
 
 # ************************************************
diff --git a/extract_data.py b/extract_data.py
@@ -0,0 +1,27 @@
+def extract_data(html: str) -> dict:
+    """Extract project titles and descriptions from an HTML page.
+
+    Args:
+        html (str): Raw HTML markup to parse.
+
+    Returns:
+        dict: ``{'projects': [{'title': ..., 'description': ...}, ...]}``, one
+        item per ``div.grid-item`` holding both a title and a description.
+    """
+    from bs4 import BeautifulSoup
+
+    soup = BeautifulSoup(html, 'html.parser')
+
+    projects = []
+    for entry in soup.find_all('div', class_='grid-item'):
+        title_tag = entry.find('h4', class_='card-title')
+        text_tag = entry.find('p', class_='card-text')
+        # find() returns None for a missing element; skip incomplete cards
+        # instead of raising AttributeError on .get_text().
+        if title_tag is None or text_tag is None:
+            continue
+        projects.append({
+            'title': title_tag.get_text(strip=True),
+            'description': text_tag.get_text(strip=True),
+        })
+    return {'projects': projects}
diff --git a/scrapegraphai/graphs/code_generator_graph.py b/scrapegraphai/graphs/code_generator_graph.py
@@ -17,17 +17,17 @@
 
 class CodeGeneratorGraph(AbstractGraph):
     """
-    CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for 
-    extarcting the wanted informations from a HTML page. The code generated is in Python and uses the library BeautifulSoup.
-    It requires a user prompt, a source URL, and a output schema.
-    
+    CodeGeneratorGraph is a script generator pipeline that generates the function extract_data(html: str) -> dict() for
+    extracting the wanted information from a HTML page. The code generated is in Python and uses the library BeautifulSoup.
+    It requires a user prompt, a source URL, and an output schema.
+
     Attributes:
         prompt (str): The prompt for the graph.
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
         schema (BaseModel): The schema for the graph output.
         llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, 
+        embedder_model: An instance of an embedding model client,
         configured for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.
@@ -96,7 +96,6 @@ def _create_graph(self) -> BaseGraph:
                 "schema": self.schema,
             }
         )
-        
         prompt_refier_node = PromptRefinerNode(
             input="user_prompt",
             output=["refined_prompt"],
@@ -106,7 +105,6 @@ def _create_graph(self) -> BaseGraph:
                 "schema": self.schema
             }
         )
-        
         html_analyzer_node = HtmlAnalyzerNode(
             input="refined_prompt & original_html",
             output=["html_info", "reduced_html"],
@@ -117,7 +115,6 @@ def _create_graph(self) -> BaseGraph:
                 "reduction": self.config.get("reduction", 0)
             }
         )
-        
         generate_code_node = GenerateCodeNode(
             input="user_prompt & refined_prompt & html_info & reduced_html & answer",
             output=["generated_code"],
@@ -166,4 +163,26 @@ def run(self) -> str:
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
-        return self.final_state.get("generated_code", "No code created.")
+        generated_code = self.final_state.get("generated_code", "No code created.")
+
+        # Read the configured name once. Previously a name without ".py" hit
+        # `filename += ".py"` on an unassigned local and raised UnboundLocalError.
+        # NOTE(review): the example config sets "output_file_name", not "filename" - confirm the key.
+        filename = self.config.get("filename") or "extracted_data.py"
+        if not filename.endswith(".py"):
+            filename += ".py"
+
+        self.save_code_to_file(generated_code, filename)
+
+        return generated_code
+
+    def save_code_to_file(self, code: str, filename: str) -> None:
+        """
+        Saves the generated code to a Python file, encoded as UTF-8.
+
+        Args:
+            code (str): The generated code to be saved.
+            filename (str): Path of the output file; overwritten if it exists.
+        """
+        with open(filename, "w", encoding="utf-8") as file:
+            file.write(code)

Original file line number	Diff line number	Diff line change
`@@ -42,6 +42,7 @@ class Projects(BaseModel):`
`42`	`42`	`"validation": 3,`
`43`	`43`	`"semantic": 3`
`44`	`44`	`},`
	`45`	`+ "output_file_name": "extracted_data.py"`
`45`	`46`	`}`
`46`	`47`
`47`	`48`	`# ************************************************`