Skip to content

Commit 596686a

Browse files
committed
update new data loader design
1 parent 41bbf7a commit 596686a

File tree

9 files changed

+565
-36
lines changed

9 files changed

+565
-36
lines changed

py-src/data_formulator/agent_routes.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
from data_formulator.agents.agent_data_load import DataLoadAgent
3030
from data_formulator.agents.agent_data_clean import DataCleanAgent
3131
from data_formulator.agents.agent_code_explanation import CodeExplanationAgent
32-
32+
from data_formulator.agents.agent_query_completion import QueryCompletionAgent
3333
from data_formulator.agents.client_utils import Client
3434

3535
from data_formulator.db_manager import db_manager
@@ -437,4 +437,25 @@ def request_code_expl():
437437
expl = code_expl_agent.run(input_tables, code)
438438
else:
439439
expl = ""
440-
return expl
440+
return expl
441+
442+
@agent_bp.route('/query-completion', methods=['POST'])
def query_completion():
    """Complete a partial or natural-language data query via QueryCompletionAgent.

    Expects a JSON body with `model`, `data_source_metadata`, and `query`;
    returns a JSON payload carrying the agent's reasoning and completed query.
    """
    if not request.is_json:
        payload = { "token": "", "status": "error", "reasoning": "unable to complete query", "query": "" }
    else:
        logger.info("# request data: ")
        content = request.get_json()

        client = get_client(content['model'])

        agent = QueryCompletionAgent(client=client)
        reasoning, completed_query = agent.run(content["data_source_metadata"], content["query"])

        payload = { "token": "", "status": "ok", "reasoning": reasoning, "query": completed_query }

    response = flask.jsonify(payload)
    # allow cross-origin callers, matching the other agent routes
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT License.
3+
4+
import pandas as pd
5+
import json
6+
7+
from data_formulator.agents.agent_utils import extract_code_from_gpt_response, extract_json_objects
8+
import re
9+
import logging
10+
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
SYSTEM_PROMPT = '''You are a data scientist to help with data queries.
16+
The user will provide you with a description of the data source and tables available in the [DATA SOURCE] section and a query in the [USER INPUTS] section.
17+
You will need to help the user complete the query and provide reasoning for the query you generated in the [OUTPUT] section.
18+
19+
Input format:
20+
* The data source description is a json object with the following fields:
21+
* `data_source`: the name of the data source
22+
* `tables`: a list of tables in the data source, which maps the table name to the list of columns available in the table.
23+
* The user input is a natural language description of the query or a partial query you need to complete.
24+
25+
Steps:
26+
* Based on data source description and user input, you should first decide on what language should be used to query the data.
27+
* Then, describe the logic for the query you generated in a json object in a block ```json``` with the following fields:
28+
* `language`: the language of the query you generated
29+
* `tables`: the names of the tables you will use in the query
30+
* `logic`: the reasoning behind why you chose the tables and the logic for the query you generated
31+
* Finally, generate the complete query in the language specified in a code block ```{language}```.
32+
33+
Output format:
34+
* The output should be in the following format, no other text should be included:
35+
36+
[REASONING]
37+
```json
38+
{
39+
"language": {language},
40+
"tables": {tables},
41+
"logic": {logic}
42+
}
43+
```
44+
45+
[QUERY]
46+
```{language}
47+
{query}
48+
```
49+
'''
50+
51+
class QueryCompletionAgent(object):
    """Agent that completes a (partial or natural-language) data query.

    Given a data-source description and a user query, it asks the model for a
    reasoning JSON object plus a completed query, then parses both out of the
    [REASONING]/[QUERY] sections of the response.
    """

    def __init__(self, client):
        self.client = client

    def run(self, data_source_metadata, query):
        """Return a (reasoning_dict, completed_query_string) pair."""
        prompt = (
            f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}"
            f"\n\n[USER INPUTS]\n\n{query}\n\n[REASONING]\n"
        )
        logger.info(prompt)

        ###### the part that calls open_ai
        completion = self.client.get_completion(messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ])

        # Re-attach the [REASONING] marker we seeded the model with so the
        # full response can be split on section markers below.
        content = '[REASONING]\n' + completion.choices[0].message.content
        logger.info(f"=== query completion output ===>\n{content}\n")

        reasoning_section = content.split("[REASONING]")[1].split("[QUERY]")[0].strip()
        reasoning = extract_json_objects(reasoning_section)[0]

        # Extract the query by removing the language markers
        raw_query = content.split("[QUERY]")[1].strip()
        fence = re.search(r"```(\w+)\s+(.*?)```", raw_query, re.DOTALL)
        if fence:
            raw_query = fence.group(2).strip()

        return reasoning, raw_query

py-src/data_formulator/data_loader/external_data_loader.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,41 @@
55
import duckdb
66
import random
77
import string
8+
import re
9+
10+
def sanitize_table_name(name_as: str) -> str:
    """Sanitize *name_as* so it is safe to use as a SQL table identifier.

    Strips common SQL-injection tokens, replaces non-identifier characters
    with underscores, prefixes names that start with a digit or collide with
    a SQL keyword, and truncates to 63 characters (common SQL identifier limit).

    Raises:
        ValueError: if the name is empty, or becomes empty after sanitization.
    """
    if not name_as:
        raise ValueError("Table name cannot be empty")

    # Remove any SQL injection attempts
    name_as = name_as.replace(";", "").replace("--", "").replace("/*", "").replace("*/", "")

    # Replace invalid characters with underscores
    # This includes special characters, spaces, dots, dashes, and other non-alphanumeric chars
    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', name_as)

    # Bug fix: input consisting solely of stripped tokens (e.g. ";" or "--")
    # becomes empty here; indexing sanitized[0] below would raise IndexError.
    if not sanitized:
        raise ValueError("Table name cannot be empty after sanitization")

    # Ensure the name starts with a letter or underscore
    if not sanitized[0].isalpha() and sanitized[0] != '_':
        sanitized = '_' + sanitized

    # Ensure the name is not a SQL keyword
    sql_keywords = {
        'SELECT', 'FROM', 'WHERE', 'GROUP', 'BY', 'ORDER', 'HAVING', 'LIMIT',
        'OFFSET', 'JOIN', 'INNER', 'LEFT', 'RIGHT', 'FULL', 'OUTER', 'ON',
        'AND', 'OR', 'NOT', 'NULL', 'TRUE', 'FALSE', 'UNION', 'ALL', 'DISTINCT',
        'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'DROP', 'TABLE', 'VIEW', 'INDEX',
        'ALTER', 'ADD', 'COLUMN', 'PRIMARY', 'KEY', 'FOREIGN', 'REFERENCES',
        'CONSTRAINT', 'DEFAULT', 'CHECK', 'UNIQUE', 'CASCADE', 'RESTRICT'
    }

    if sanitized.upper() in sql_keywords:
        sanitized = '_' + sanitized

    # Ensure the name is not too long (common SQL limit is 63 characters)
    if len(sanitized) > 63:
        sanitized = sanitized[:63]

    return sanitized
843

944
class ExternalDataLoader(ABC):
1045

@@ -45,6 +80,10 @@ def list_tables(self) -> List[Dict[str, Any]]:
4580
def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000):
4681
pass
4782

83+
@abstractmethod
84+
def view_query_sample(self, query: str) -> str:
85+
pass
86+
4887
@abstractmethod
4988
def ingest_data_from_query(self, query: str, name_as: str):
5089
pass

py-src/data_formulator/data_loader/kusto_data_loader.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,8 @@
88
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
99
from azure.kusto.data.helpers import dataframe_from_result_table
1010

11-
from data_formulator.data_loader.external_data_loader import ExternalDataLoader
11+
from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name
1212

13-
def sanitize_table_name(table_name: str) -> str:
14-
return table_name.replace(".", "_").replace("-", "_")
1513

1614
class KustoDataLoader(ExternalDataLoader):
1715

@@ -53,8 +51,6 @@ def query(self, kql: str) -> pd.DataFrame:
5351
return dataframe_from_result_table(result.primary_results[0])
5452

5553
def list_tables(self) -> List[Dict[str, Any]]:
56-
57-
5854
# first list functions (views)
5955
query = ".show functions"
6056
function_result_df = self.query(query)
@@ -170,6 +166,8 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000)
170166

171167
total_rows_ingested += len(chunk_df)
172168

169+
    def view_query_sample(self, query: str) -> List[Dict[str, Any]]:
        """Run *query* against Kusto and return at most 10 rows as record dicts."""
        # NOTE(review): the original annotation said `-> str`, but
        # to_dict(orient="records") returns a list of dicts; annotation fixed.
        return self.query(query).head(10).to_dict(orient="records")
173171

174172
def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
175173
# Sanitize the table name for SQL compatibility

py-src/data_formulator/data_loader/mysql_data_loader.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import pandas as pd
44
import duckdb
55

6-
from data_formulator.data_loader.external_data_loader import ExternalDataLoader
6+
from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name
77
from typing import Dict, Any
88

99
class MySQLDataLoader(ExternalDataLoader):
@@ -31,6 +31,12 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti
3131
if value:
3232
attatch_string += f"{key}={value} "
3333

34+
# Detach existing mysqldb connection if it exists
35+
try:
36+
self.duck_db_conn.execute("DETACH mysqldb;")
37+
except:
38+
pass # Ignore if mysqldb doesn't exist
39+
3440
# Register MySQL connection
3541
self.duck_db_conn.execute(f"ATTACH '{attatch_string}' AS mysqldb (TYPE mysql);")
3642

@@ -44,21 +50,21 @@ def list_tables(self):
4450

4551
for schema, table_name in tables_df.values:
4652

47-
full_table_name = f"{schema}.{table_name}"
53+
full_table_name = f"mysqldb.{schema}.{table_name}"
4854

4955
# Get column information using DuckDB's information schema
50-
columns_df = self.duck_db_conn.execute(f"DESCRIBE mysqldb.{full_table_name}").df()
56+
columns_df = self.duck_db_conn.execute(f"DESCRIBE {full_table_name}").df()
5157
columns = [{
5258
'name': row['column_name'],
5359
'type': row['column_type']
5460
} for _, row in columns_df.iterrows()]
5561

5662
# Get sample data
57-
sample_df = self.duck_db_conn.execute(f"SELECT * FROM mysqldb.{full_table_name} LIMIT 10").df()
63+
sample_df = self.duck_db_conn.execute(f"SELECT * FROM {full_table_name} LIMIT 10").df()
5864
sample_rows = json.loads(sample_df.to_json(orient="records"))
5965

6066
# get row count
61-
row_count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM mysqldb.{full_table_name}").fetchone()[0]
67+
row_count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM {full_table_name}").fetchone()[0]
6268

6369
table_metadata = {
6470
"row_count": row_count,
@@ -73,19 +79,24 @@ def list_tables(self):
7379

7480
return results
7581

76-
def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000):
82+
    def ingest_data(self, table_name: str, name_as: str | None = None, size: int = 1000000):
        """Copy up to *size* rows of a MySQL table into the local DuckDB database.

        Args:
            table_name: fully qualified source name (e.g. "mysqldb.schema.table",
                as produced by list_tables).
            name_as: target table name in DuckDB's main schema; defaults to the
                last dotted component of table_name.
            size: maximum number of rows to copy.
        """
        # Create table in the main DuckDB database from MySQL data
        if name_as is None:
            name_as = table_name.split('.')[-1]

        # target name is sanitized; NOTE(review): table_name itself is
        # interpolated unsanitized — assumed to come from list_tables, confirm.
        name_as = sanitize_table_name(name_as)

        self.duck_db_conn.execute(f"""
            CREATE OR REPLACE TABLE main.{name_as} AS
            SELECT * FROM {table_name}
            LIMIT {size}
        """)
""")
8694

95+
    def view_query_sample(self, query: str) -> list[dict[str, Any]]:
        """Execute *query* in DuckDB and return at most 10 rows as record dicts."""
        # NOTE(review): the original annotation said `-> str`, but
        # to_dict(orient="records") returns a list of dicts; annotation fixed.
        return self.duck_db_conn.execute(query).df().head(10).to_dict(orient="records")
97+
8798
    def ingest_data_from_query(self, query: str, name_as: str) -> None:
        """Materialize the result of *query* as table *name_as* in DuckDB.

        NOTE(review): the original annotation said `-> pd.DataFrame`, but the
        method returns nothing; annotation fixed to None.
        """
        # Execute the query and get results as a DataFrame
        df = self.duck_db_conn.execute(query).df()
        # Use the base class's method to ingest the DataFrame
        self.ingest_df_to_duckdb(df, name_as)

py-src/data_formulator/tables_routes.py

Lines changed: 73 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -691,17 +691,20 @@ def sanitize_db_error_message(error: Exception) -> Tuple[str, int]:
691691
# Define patterns for known safe errors
692692
safe_error_patterns = {
693693
# Database table errors
694-
r"Table.*does not exist": ("Specified table was not found", 404),
695-
r"Table.*already exists": ("A table with this name already exists", 409),
694+
r"Table.*does not exist": (error_msg, 404),
695+
r"Table.*already exists": (error_msg, 409),
696696
# Query errors
697-
r"syntax error in SQL": ("Invalid SQL query syntax", 400),
698-
r"Invalid input syntax": ("Invalid input data format", 400),
697+
r"syntax error": (error_msg, 400),
698+
r"Catalog Error": (error_msg, 404),
699+
r"Binder Error": (error_msg, 400),
700+
r"Invalid input syntax": (error_msg, 400),
701+
699702
# File errors
700-
r"No such file": ("File not found", 404),
703+
r"No such file": (error_msg, 404),
701704
r"Permission denied": ("Access denied", 403),
702705

703706
# Data loader errors
704-
r"Entity ID": ("Entity ID not found, please check the data loader parameters", 500),
707+
r"Entity ID": (error_msg, 500),
705708
r"session_id": ("session_id not found, please refresh the page", 500),
706709
}
707710

@@ -790,6 +793,70 @@ def data_loader_ingest_data():
790793
"message": "Successfully ingested data from data loader"
791794
})
792795

796+
except Exception as e:
797+
logger.error(f"Error ingesting data from data loader: {str(e)}")
798+
safe_msg, status_code = sanitize_db_error_message(e)
799+
return jsonify({
800+
"status": "error",
801+
"message": safe_msg
802+
}), status_code
803+
804+
805+
@tables_bp.route('/data-loader/view-query-sample', methods=['POST'])
def data_loader_view_query_sample():
    """View a sample of data from a query"""
    try:
        payload = request.get_json()
        loader_type = payload.get('data_loader_type')
        loader_params = payload.get('data_loader_params')
        query = payload.get('query')

        if loader_type not in DATA_LOADERS:
            message = f"Invalid data loader type. Must be one of: {', '.join(DATA_LOADERS.keys())}"
            return jsonify({"status": "error", "message": message}), 400

        # run the sample query against the session-scoped DuckDB connection
        with db_manager.connection(session['session_id']) as duck_db_conn:
            loader = DATA_LOADERS[loader_type](loader_params, duck_db_conn)
            rows = loader.view_query_sample(query)

        return jsonify({
            "status": "success",
            "sample": rows,
            "message": "Successfully retrieved query sample"
        })
    except Exception as e:
        # map raw DB errors to a safe client-facing message + HTTP status
        logger.error(f"Error viewing query sample: {str(e)}")
        safe_msg, status_code = sanitize_db_error_message(e)
        return jsonify({
            "status": "error",
            "sample": [],
            "message": safe_msg
        }), status_code
836+
837+
@tables_bp.route('/data-loader/ingest-data-from-query', methods=['POST'])
838+
def data_loader_ingest_data_from_query():
839+
"""Ingest data from a data loader"""
840+
841+
try:
842+
data = request.get_json()
843+
data_loader_type = data.get('data_loader_type')
844+
data_loader_params = data.get('data_loader_params')
845+
query = data.get('query')
846+
name_as = data.get('name_as')
847+
848+
if data_loader_type not in DATA_LOADERS:
849+
return jsonify({"status": "error", "message": f"Invalid data loader type. Must be one of: {', '.join(DATA_LOADERS.keys())}"}), 400
850+
851+
with db_manager.connection(session['session_id']) as duck_db_conn:
852+
data_loader = DATA_LOADERS[data_loader_type](data_loader_params, duck_db_conn)
853+
data_loader.ingest_data_from_query(query, name_as)
854+
855+
return jsonify({
856+
"status": "success",
857+
"message": "Successfully ingested data from data loader"
858+
})
859+
793860
except Exception as e:
794861
logger.error(f"Error ingesting data from data loader: {str(e)}")
795862
safe_msg, status_code = sanitize_db_error_message(e)

src/app/utils.tsx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ export function getUrls() {
6969
DATA_LOADER_LIST_DATA_LOADERS: `/api/tables/data-loader/list-data-loaders`,
7070
DATA_LOADER_LIST_TABLES: `/api/tables/data-loader/list-tables`,
7171
DATA_LOADER_INGEST_DATA: `/api/tables/data-loader/ingest-data`,
72+
DATA_LOADER_VIEW_QUERY_SAMPLE: `/api/tables/data-loader/view-query-sample`,
73+
DATA_LOADER_INGEST_DATA_FROM_QUERY: `/api/tables/data-loader/ingest-data-from-query`,
74+
75+
QUERY_COMPLETION: `/api/agent/query-completion`,
7276
};
7377
}
7478

0 commit comments

Comments
 (0)