coolbeevip
diff --git a/camel_database_agent/cli.py b/camel_database_agent/cli.py
Lines changed: 4 additions & 1 deletion
diff --git a/camel_database_agent/database/database_schema_parse.py b/camel_database_agent/database/database_schema_parse.py
Lines changed: 66 additions & 19 deletions
diff --git a/camel_database_agent/database/dialect/database_schema_dialect.py b/camel_database_agent/database/dialect/database_schema_dialect.py
Lines changed: 28 additions & 25 deletions
@@ -151,11 +151,13 @@ def main() -> None:
         language=args.language,
         data_path=data_path,
     )
-    database_agent.train_knowledge(
+    token_usage = database_agent.train_knowledge(
         level=TrainLevel.MEDIUM,
         reset_train=args.reset_train,
     )
 
+    print(f"{Fore.GREEN}")
+    print("=" * 50)
     print(f"{Fore.GREEN}Database Overview")
     print("=" * 50)
     print(f"{database_agent.get_summary()}")
@@ -173,6 +175,7 @@ def main() -> None:
         f"{Fore.CYAN}Type {Fore.LIGHTYELLOW_EX}'help'{Fore.RESET} "
         f"to get more recommended questions"
     )
+    print(f"{Fore.CYAN}Training completed, using {token_usage.total_tokens} tokens{Fore.RESET}")
     print(f"{Fore.CYAN}=" * 50)
 
     session_id = str(uuid.uuid4())
 
@@ -1,4 +1,5 @@
-from typing import List, Union
+import textwrap
+from typing import List, Optional, TypeVar, Union
 
 from camel.agents import ChatAgent
 from camel.models import BaseModelBackend
@@ -17,7 +18,7 @@ class DDLRecord(BaseModel):
 class DMLRecord(BaseModel):
     id: str
     summary: str
-    sql: str
+    dataset: str
 
 
 class QueryRecord(BaseModel):
@@ -26,6 +27,14 @@ class QueryRecord(BaseModel):
     sql: str
 
 
+RecordType = TypeVar("RecordType", DDLRecord, DMLRecord, QueryRecord)
+
+
+class SchemaParseResponse(BaseModel):
+    data: List[RecordType]
+    usage: Optional[dict]
+
+
 class DDLRecordResponseFormat(BaseModel):
     items: List[DDLRecord]
 
@@ -52,36 +61,74 @@ def __init__(
         )
 
     @timing
-    def parse_ddl_record(self, text: str) -> List[DDLRecord]:
+    def parse_ddl_record(self, text: str) -> SchemaParseResponse:
         """Parsing DDL SQL statements"""
         prompt = (
-            "The following are some DDL script. Please read the script in its "
-            "entirety and provide descriptions for the tables and fields to "
-            "generate summary information and extract the SQL script for each "
-            "table.\n\n"
+            "Translate the following information into a JSON array format, "
+            "with each JSON object in the array containing three "
+            "elements: "
+            "\"id\" for the table name, "
+            "\"summary\" for a summary of the table, and "
+            "\"sql\" for the SQL statement of the table creation.\n\n"
         )
-        prompt += f"```sql\n{text}```\n\n"
-        prompt += "Please output the summary information and SQL script in JSON format."
+        if text.startswith("```sql"):
+            prompt += f"{text}\n\n"
+        else:
+            prompt += f"```sql\n{text}```\n\n"
+
+        # The following snippet must be added for non-OpenAI models
+        prompt += textwrap.dedent(
+            "Output Format:\n"
+            "{"
+            "    \"items\":"
+            "        ["
+            "            {"
+            "                \"id\": \"<table name>\","
+            "                \"summary\": \"<table summary>\","
+            "                \"sql\": \"<table ddl script>\""
+            "            }"
+            "        ]"
+            "}\n\n"
+        )
+        prompt += "Now, directly output the JSON array without explanation."
         response = self.parsing_agent.step(prompt, response_format=DDLRecordResponseFormat)
         ddl_record_response = DDLRecordResponseFormat.model_validate_json(response.msgs[0].content)
-        return ddl_record_response.items
+        return SchemaParseResponse(data=ddl_record_response.items, usage=response.info["usage"])
 
     @timing
-    def parse_dml_record(self, text: str) -> List[DMLRecord]:
+    def parse_dml_record(self, text: str) -> SchemaParseResponse:
         """Parsing DML SQL statements"""
         prompt = (
-            "The following are some DML statements from which you need "
-            "to extract table names, field names, and generate summary "
-            "information, as well as extract each SQL statement.\n\n"
+            "Translate the following information into a JSON array format, "
+            "with each JSON object in the array containing three "
+            "elements: "
+            "\"id\" for the table name, "
+            "\"summary\" for a summary of the table, and "
+            "\"dataset\" for the Markdown of the data.\n\n"
         )
-        prompt += f"```sql\n{text}```\n"
-        prompt += "Please output the summary information and SQL script in JSON format."
+        prompt += f"{text}\n\n"
+
+        # The following snippet must be added for non-OpenAI models
+        prompt += textwrap.dedent(
+            "Output Format:\n"
+            "{"
+            "    \"items\":"
+            "        ["
+            "            {"
+            "                \"id\": \"<table name>\","
+            "                \"summary\": \"<table summary>\","
+            "                \"dataset\": \"<markdown dataset>\""
+            "            }"
+            "        ]"
+            "}\n\n"
+        )
+        prompt += "Now, directly output the JSON array without explanation."
         response = self.parsing_agent.step(prompt, response_format=DMLRecordResponseFormat)
         dml_record_response = DMLRecordResponseFormat.model_validate_json(response.msgs[0].content)
-        return dml_record_response.items
+        return SchemaParseResponse(data=dml_record_response.items, usage=response.info["usage"])
 
     @timing
-    def parse_query_record(self, text: str) -> List[QueryRecord]:
+    def parse_query_record(self, text: str) -> SchemaParseResponse:
         """Parsing Query SQL statements"""
         prompt = (
             "The following is an analysis of user query requirements, "
@@ -94,4 +141,4 @@ def parse_query_record(self, text: str) -> List[QueryRecord]:
         query_record_response = QueryRecordResponseFormat.model_validate_json(
             response.msgs[0].content
         )
-        return query_record_response.items
+        return SchemaParseResponse(data=query_record_response.items, usage=response.info["usage"])
@@ -4,6 +4,7 @@
 
 from camel.agents import ChatAgent
 from camel.models import BaseModelBackend
+from tabulate import tabulate
 
 from camel_database_agent.database.database_manager import DatabaseManager
 from camel_database_agent.database_prompt import POLISH_SCHEMA_OUTPUT_EXAMPLE
@@ -81,38 +82,40 @@ def get_sampled_data(self, data_samples_size: int = 5) -> str:
         Must be implemented by all dialect subclasses.
         """
         metadata = self.database_manager.get_metadata()
-        sample_data_sql = []
+        sample_data = []
 
         for table_name in metadata.tables:
-            table = metadata.tables[table_name]
-            column_names = [column.name for column in table.columns]
+            # table = metadata.tables[table_name]
+            # column_names = [column.name for column in table.columns]
 
             sample_query = f"SELECT * FROM {table_name} LIMIT {data_samples_size}"
             try:
                 rows = self.database_manager.select(sample_query)
-                for row in rows:
-                    columns = []
-                    values = []
-
-                    for col_name in column_names:
-                        if col_name in row and row[col_name] is not None:
-                            columns.append(col_name)
-                            if isinstance(row[col_name], str):
-                                values.append("'" + row[col_name].replace("'", "''") + "'")
-                            elif isinstance(row[col_name], (int, float)):
-                                values.append(str(row[col_name]))
-                            else:
-                                values.append(f"'{row[col_name]!s}'")
-
-                    if columns and values:
-                        columns_stmt = ', '.join(columns)
-                        values_stmt = ', '.join(values)
-                        insert_stmt = (
-                            f"INSERT INTO {table_name} ({columns_stmt}) VALUES ({values_stmt});"
-                        )
-                        sample_data_sql.append(insert_stmt)
+                dataset = tabulate(tabular_data=rows, headers='keys', tablefmt='psql')
+                sample_data.append(f"## {table_name}\n\n{dataset}")
+                # for row in rows:
+                #     columns = []
+                #     values = []
+                #
+                #     for col_name in column_names:
+                #         if col_name in row and row[col_name] is not None:
+                #             columns.append(col_name)
+                #             if isinstance(row[col_name], str):
+                #                 values.append("'" + row[col_name].replace("'", "''") + "'")
+                #             elif isinstance(row[col_name], (int, float)):
+                #                 values.append(str(row[col_name]))
+                #             else:
+                #                 values.append(f"'{row[col_name]!s}'")
+                #
+                #     if columns and values:
+                #         columns_stmt = ', '.join(columns)
+                #         values_stmt = ', '.join(values)
+                #         insert_stmt = (
+                #             f"INSERT INTO {table_name} ({columns_stmt}) VALUES ({values_stmt});"
+                #         )
+                #         sample_data_sql.append(insert_stmt)
 
             except Exception as e:
                 logger.warning(f"Error sampling data from table {table_name}: {e}")
 
-        return "\n".join(sample_data_sql)
+        return "\n\n".join(sample_data)