Rename parse_dml_record to parse_sampled_record and update prompt templates for JSON output

coolbeevip · coolbeevip · commit d5d0b34eab35 · 2025-04-16T21:39:14.000+08:00
diff --git a/camel_database_agent/database/prompts.py b/camel_database_agent/database/prompts.py
@@ -37,3 +37,34 @@ class PromptTemplates:
     - Specify the expected format and content of comments
     - Emphasize professionalism and conciseness
     """)
+
+    PARSE_SAMPLED_RECORD = textwrap.dedent("""
+    # JSON Format Request
+    You are a specialized JSON generator. Your only function is to parse the provided data and convert it to JSON format, strictly following the format requirements.
+    
+    ## Input Data:
+    {{section}}
+    
+    ## Instructions:
+    1. Create a JSON array with each table as an object
+    2. Each object must have exactly three fields:
+       - "id": the table name
+       - "summary": a brief description of the table
+       - "dataset": the data in markdown format
+    3. The entire response must be ONLY valid JSON without any additional text, explanation, or markdown code blocks
+    
+    ## Required Output Format:
+    {
+        "items":[{
+            "id": "<table name>",
+            "summary": "<table summary>",
+            "dataset": "<markdown dataset>"
+        }]
+    }
+    
+    ## IMPORTANT:
+    - Your response must contain ONLY the JSON object, nothing else
+    - Do not include explanations, introductions, or conclusions
+    - Do not use markdown code blocks (```) around the JSON
+    - Do not include phrases like "Here's the JSON" or "I've created the JSON"
+    - Do not indicate that you are providing the output in any way""")
diff --git a/camel_database_agent/database/schema.py b/camel_database_agent/database/schema.py
@@ -1,3 +1,5 @@
+import logging
+import re
 import textwrap
 from typing import Generic, List, Optional, TypeVar, Union
 
@@ -6,8 +8,11 @@
 from pydantic import BaseModel
 
 from camel_database_agent.database.manager import DatabaseManager
+from camel_database_agent.database.prompts import PromptTemplates
 from camel_database_agent.database_base import timing
 
+logger = logging.getLogger(__name__)
+
 
 class DDLRecord(BaseModel):
     id: str
@@ -97,36 +102,30 @@ def parse_ddl_record(self, text: str) -> SchemaParseResponse:
         return SchemaParseResponse(data=ddl_record_response.items, usage=response.info["usage"])
 
     @timing
-    def parse_dml_record(self, text: str) -> SchemaParseResponse:
+    def parse_sampled_record(self, text: str) -> SchemaParseResponse:
         """Parsing Sampled Data"""
-        prompt = (
-            "Translate the following information into a JSON array format, "
-            "with each JSON object in the array containing three "
-            "elements: "
-            "\"id\" for the table name, "
-            "\"summary\" for a summary of the table, and "
-            "\"dataset\" for the Markdown of the data.\n\n"
-        )
-        prompt += f"{text}\n\n"
-
-        # 非 openai 模型要增加以下片段
-        prompt += textwrap.dedent(
-            "Output Format:\n"
-            "{"
-            "    \"items\":"
-            "        ["
-            "            {"
-            "                \"id\": \"<table name>\","
-            "                \"summary\": \"<table summary>\","
-            "                \"dataset\": \"<markdown dataset>\""
-            "            }"
-            "        ]"
-            "}\n\n"
-        )
-        prompt += "Now, directly output the JSON array without explanation."
-        response = self.parsing_agent.step(prompt, response_format=DMLRecordResponseFormat)
-        dml_record_response = DMLRecordResponseFormat.model_validate_json(response.msgs[0].content)
-        return SchemaParseResponse(data=dml_record_response.items, usage=response.info["usage"])
+        data: List[DMLRecord] = []  # records collected across all sections
+        usage: Optional[dict] = None  # stays None if no section succeeds
+        sections = self.split_markdown_by_h2(text)  # one request per H2 section
+        for section in sections:
+            prompt = PromptTemplates.PARSE_SAMPLED_RECORD.replace("{{section}}", section)
+            try:
+                self.parsing_agent.reset()  # NOTE(review): presumably clears prior turns - confirm
+                response = self.parsing_agent.step(prompt, response_format=DMLRecordResponseFormat)
+                dml_record_response = DMLRecordResponseFormat.model_validate_json(
+                    response.msgs[0].content
+                )
+                data.extend(dml_record_response.items)
+                if usage is None:
+                    usage = dict(response.info["usage"])  # copy; totals are summed in place
+                else:
+                    usage["completion_tokens"] += response.info["usage"]["completion_tokens"]
+                    usage["prompt_tokens"] += response.info["usage"]["prompt_tokens"]
+                    usage["total_tokens"] += response.info["usage"]["total_tokens"]
+            except Exception as e:  # best-effort: log the failed section, keep going
+                logger.exception(f"Unable to process messages: {e}")
+                logger.error(f"Prompt: {prompt}")
+        return SchemaParseResponse(data=data, usage=usage)
 
     @timing
     def parse_query_record(self, text: str) -> SchemaParseResponse:
@@ -143,3 +142,8 @@ def parse_query_record(self, text: str) -> SchemaParseResponse:
             response.msgs[0].content
         )
         return SchemaParseResponse(data=query_record_response.items, usage=response.info["usage"])
+
+    def split_markdown_by_h2(self, markdown_text):
+        """Cut the text before each line starting with '##' plus whitespace; return stripped, non-empty parts."""
+        pieces = re.split(r'(?=^##\s+)', markdown_text, flags=re.MULTILINE)
+        return [piece.strip() for piece in pieces if piece.strip()]
diff --git a/camel_database_agent/database_agent.py b/camel_database_agent/database_agent.py
@@ -236,7 +236,7 @@ def _parse_sampled_data_to_knowledge(self, data_samples_size: int = 5) -> TokenU
         ) as f:
             f.write(self.data_sql)
 
-        schema_parse_response: SchemaParseResponse = self.schema_parse.parse_dml_record(
+        schema_parse_response: SchemaParseResponse = self.schema_parse.parse_sampled_record(
             self.data_sql
         )
 
diff --git a/camel_database_agent/datagen/pipeline.py b/camel_database_agent/datagen/pipeline.py
@@ -55,6 +55,7 @@ def _prepare_prompt(self, query_samples_needed: int) -> str:
         prompt = prompt.replace("{{ddl_sql}}", self.ddl_sql)
         prompt = prompt.replace("{{data_sql}}", self.data_sql)
         prompt = prompt.replace("{{query_samples_size}}", str(query_samples_needed))
+        prompt = prompt.replace("{{dialect_name}}", self.database_manager.dialect_name())
         return prompt
 
     def _parse_response_content(self, content: str) -> List[QueryRecord]:
@@ -76,7 +77,7 @@ def _validate_query(self, query_record: QueryRecord) -> bool:
             self.database_manager.select(query_record.sql)
             return True
         except SQLExecutionError as e:
-            logger.error(f"{Fore.RED}SQLExecutionError{Fore.RESET}: {e.sql} {e.error_message}")
+            logger.debug(f"{Fore.RED}SQLExecutionError{Fore.RESET}: {e.sql} {e.error_message}")
             return False
         except Exception as e:
             logger.error(
diff --git a/camel_database_agent/datagen/prompts.py b/camel_database_agent/datagen/prompts.py
@@ -3,16 +3,44 @@
 
 class PromptTemplates:
     QUESTION_INFERENCE_PIPELINE = textwrap.dedent("""
-    Please carefully analyze the following database information and conduct an in-depth analysis from a business perspective. What business query questions might users raise? Please fully consider some complex query scenarios, including but not limited to multi-table associations, grouping statistics, etc.
+    # JSON Format Request
+    
+    You are a specialized JSON generator. Your only function is to parse the provided data and convert it to JSON format, strictly following the format requirements.    
 
-    Database Schema:
+    ## Database Schema:
     ```
     {{ddl_sql}}
     ```
 
-    Data Example:
+    ## Data Example:
     ```sql
     {{data_sql}}
     ```
 
-    Now, Please generate {{query_samples_size}} real user query questions along with the corresponding SQL query statements without using placeholders. Please output in JSON format.""")
+    ## Instructions:
+    Database System: {{dialect_name}}
+    1. Please carefully analyze the database information above and conduct an in-depth analysis from a business perspective. What business query questions might users raise? Please fully consider some complex query scenarios, including but not limited to multi-table associations, grouping statistics, etc.
+    2. Please ensure that the SQL you write conforms to {{dialect_name}} syntax.
+    3. Generate {{query_samples_size}} real user query questions along with the corresponding SQL query statements without using placeholders
+    4. Create a JSON array with each question as an object
+    5. Each object must have exactly three fields:
+       - "id": the table name
+       - "question": a query in natural language.
+       - "sql": sql statements without placeholders.
+    6. The entire response must be ONLY valid JSON without any additional text, explanation, or markdown code blocks
+    
+    ## Required Output Format:
+    {
+        "items":[{
+            "id": "<table name>",
+            "question": "<a query in natural language>",
+            "sql": "<sql statements>"
+        }]
+    }
+    
+    ## IMPORTANT:
+    - Your response must contain ONLY the JSON object, nothing else
+    - Do not include explanations, introductions, or conclusions
+    - Do not use markdown code blocks (```) around the JSON
+    - Do not include phrases like "Here's the JSON" or "I've created the JSON"
+    - Do not indicate that you are providing the output in any way.""")
diff --git a/tests/integration_tests/test_database_schema_parse.py b/tests/integration_tests/test_database_schema_parse.py
@@ -37,7 +37,7 @@ def test_parse_ddl_record(self) -> None:
     def test_parse_dml_record(self) -> None:
         current_dir = os.path.dirname(os.path.abspath(__file__))
         with open(os.path.join(current_dir, "data.sql"), "r") as f:
-            schema_parse_response: SchemaParseResponse = self.parse.parse_dml_record(f.read())
+            schema_parse_response: SchemaParseResponse = self.parse.parse_sampled_record(f.read())
             assert len(schema_parse_response.data) == 6
 
     def test_parse_query_record(self) -> None:

Original file line number	Diff line number	Diff line change
`@@ -236,7 +236,7 @@ def _parse_sampled_data_to_knowledge(self, data_samples_size: int = 5) -> TokenU`
`236`	`236`	`) as f:`
`237`	`237`	`f.write(self.data_sql)`
`238`	`238`
`239`		`- schema_parse_response: SchemaParseResponse = self.schema_parse.parse_dml_record(`
	`239`	`+ schema_parse_response: SchemaParseResponse = self.schema_parse.parse_sampled_record(`
`240`	`240`	`self.data_sql`
`241`	`241`	`)`
`242`	`242`