Skip to content

Commit 94595b3

Browse files
committed
experiment with new data loading protocol
1 parent d143354 commit 94595b3

File tree

9 files changed

+673
-219
lines changed

9 files changed

+673
-219
lines changed

.env.template

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,4 @@
55
DISABLE_DISPLAY_KEYS=false # if true, the display keys will not be shown in the frontend
66
EXEC_PYTHON_IN_SUBPROCESS=false # if true, the python code will be executed in a subprocess to avoid crashing the main app, but it will increase the time of response
77

8-
LOCAL_DB_DIR= # the directory to store the local database, if not provided, the app will use the temp directory
9-
10-
# External database connection settings
11-
# check https://duckdb.org/docs/stable/extensions/mysql.html
12-
# and https://duckdb.org/docs/stable/extensions/postgres.html
13-
USE_EXTERNAL_DB=false # if true, the app will use an external database instead of the one in the app
14-
DB_NAME=mysql_db # the name to refer to this database connection
15-
DB_TYPE=mysql # mysql or postgresql
16-
DB_HOST=localhost
17-
DB_PORT=0
18-
DB_DATABASE=mysql
19-
DB_USER=root
20-
DB_PASSWORD=
8+
LOCAL_DB_DIR= # the directory to store the local database, if not provided, the app will use the temp directory

py-src/data_formulator/app.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from data_formulator.tables_routes import tables_bp
3838
from data_formulator.agent_routes import agent_bp
3939

40+
4041
app = Flask(__name__, static_url_path='', static_folder=os.path.join(APP_ROOT, "dist"))
4142
app.secret_key = secrets.token_hex(16) # Generate a random secret key for sessions
4243

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from abc import ABC, abstractmethod
2+
from typing import Dict, Any, List
3+
import pandas as pd
4+
import json
5+
import duckdb
6+
import random
7+
import string
8+
9+
class ExternalDataLoader(ABC):
    """Abstract base for loaders that copy data from an external source
    (Kusto, MySQL, ...) into a session-local DuckDB database.

    Concrete subclasses must provide their parameter schema, construction,
    table discovery, and the two ingestion entry points.
    """

    # Set by concrete subclasses in __init__; used by ingest_df_to_duckdb.
    duck_db_conn: duckdb.DuckDBPyConnection

    def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str):
        """Materialize *df* as a new DuckDB table named *table_name*.

        If the name is taken, a numeric suffix (``_1``, ``_2``, ...) is
        appended until a free name is found, so existing tables are never
        overwritten.
        """
        base_name = table_name
        counter = 1
        while True:
            # Parameterized query: table_name comes from user input, so never
            # splice it into SQL text directly.
            exists = self.duck_db_conn.execute(
                "SELECT COUNT(*) FROM duckdb_tables() WHERE table_name = ?",
                [table_name],
            ).fetchone()[0] > 0
            if not exists:
                break
            # Name collision: try the next suffixed candidate.
            table_name = f"{base_name}_{counter}"
            counter += 1

        # Register the dataframe under a random temporary view name so
        # concurrent ingests in the same connection cannot clash.
        random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
        self.duck_db_conn.register(f'df_temp_{random_suffix}', df)
        # Quote the identifier so names with unusual characters cannot break
        # (or inject into) the CREATE statement.
        self.duck_db_conn.execute(f'CREATE TABLE "{table_name}" AS SELECT * FROM df_temp_{random_suffix}')
        self.duck_db_conn.execute(f"DROP VIEW df_temp_{random_suffix}")  # Drop the temporary view after creating the table

    @staticmethod
    @abstractmethod
    def list_params() -> List[Dict[str, Any]]:
        """Describe the connection parameters this loader accepts."""
        pass

    @abstractmethod
    def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
        """Connect to the external source using *params* and remember the DuckDB connection."""
        pass

    @abstractmethod
    def list_tables(self) -> List[Dict[str, Any]]:
        # should include: table_name, column_names, column_types, sample_data
        pass

    @abstractmethod
    def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000):
        """Copy up to *size* rows of *table_name* into DuckDB (as *name_as* if given)."""
        pass

    @abstractmethod
    def ingest_data_from_query(self, query: str, name_as: str):
        """Run *query* against the external source and store the result as *name_as*."""
        pass
51+
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
from typing import Dict, Any, List
2+
import pandas as pd
3+
import json
4+
import duckdb
5+
import random
6+
import string
7+
8+
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
9+
from azure.kusto.data.helpers import dataframe_from_result_table
10+
11+
from data_formulator.data_loader.external_data_loader import ExternalDataLoader
12+
13+
def sanitize_table_name(table_name: str) -> str:
    """Return *table_name* with dots and dashes replaced by underscores,
    making it safe to use as a SQL identifier."""
    replacements = str.maketrans({".": "_", "-": "_"})
    return table_name.translate(replacements)
15+
16+
class KustoDataLoader(ExternalDataLoader):
    """Load tables from an Azure Data Explorer (Kusto) cluster into DuckDB."""

    @staticmethod
    def list_params() -> List[Dict[str, Any]]:
        """Connection parameters accepted by this loader.

        client_id/client_secret/tenant_id are optional: when absent, Azure CLI
        authentication is used instead of AAD application-key auth.
        """
        params_list = [
            {"name": "kusto_cluster", "type": "string", "required": True},
            {"name": "kusto_database", "type": "string", "required": True},
            {"name": "client_id", "type": "string", "required": False},
            {"name": "client_secret", "type": "string", "required": False},
            {"name": "tenant_id", "type": "string", "required": False}
        ]
        return params_list

    def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
        """Create a Kusto client from *params* and keep the DuckDB connection.

        Raises:
            Exception: if the Kusto client cannot be created (e.g. the user
                has not authenticated with Azure CLI).
        """
        self.kusto_cluster = params.get("kusto_cluster", None)
        self.kusto_database = params.get("kusto_database", None)

        self.client_id = params.get("client_id", None)
        self.client_secret = params.get("client_secret", None)
        self.tenant_id = params.get("tenant_id", None)

        try:
            if self.client_id and self.client_secret and self.tenant_id:
                # AAD application key authentication when full credentials are supplied.
                self.client = KustoClient(KustoConnectionStringBuilder.with_aad_application_key_authentication(
                    self.kusto_cluster, self.client_id, self.client_secret, self.tenant_id))
            else:
                # Fall back to Azure CLI auth, but you can also use other auth types.
                self.client = KustoClient(KustoConnectionStringBuilder.with_az_cli_authentication(self.kusto_cluster))
        except Exception as e:
            raise Exception(f"Error creating Kusto client: {e}, please authenticate with Azure CLI when starting the app.")

        self.duck_db_conn = duck_db_conn

    def query(self, kql: str) -> pd.DataFrame:
        """Run *kql* against the configured database and return the primary result as a DataFrame."""
        result = self.client.execute(self.kusto_database, kql)
        return dataframe_from_result_table(result.primary_results[0])

    def list_tables(self) -> List[Dict[str, Any]]:
        """Enumerate tables with their columns, row counts, and 10 sample rows."""
        query = ".show tables"
        tables_df = self.query(query)

        results = []
        for table in tables_df.to_dict(orient="records"):
            table_name = table['TableName']
            # Schema comes back as a JSON string in the 'Schema' column.
            schema_result = self.query(f".show table ['{table_name}'] schema as json").to_dict(orient="records")
            columns = [{
                'name': r["Name"],
                'type': r["Type"]
            } for r in json.loads(schema_result[0]['Schema'])['OrderedColumns']]

            row_count_result = self.query(f".show table ['{table_name}'] details").to_dict(orient="records")
            row_count = row_count_result[0]["TotalRowCount"]

            sample_query = f"['{table_name}'] | take {10}"
            sample_result = self.query(sample_query).to_dict(orient="records")

            table_metadata = {
                "row_count": row_count,
                "columns": columns,
                "sample_rows": sample_result
            }

            results.append({
                "name": table_name,
                "metadata": table_metadata
            })

        return results

    def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000):
        """Copy up to *size* rows of *table_name* into DuckDB in chunks.

        The chunk size is estimated from a 10k-row data-size sample so each
        Kusto response stays under the ~64MB response limit; on a failed
        chunk query the chunk size is reduced and the chunk retried.
        """
        if name_as is None:
            name_as = table_name
        # Sanitize once up front instead of on every chunk.
        name_as = sanitize_table_name(name_as)

        total_rows_ingested = 0
        first_chunk = True

        size_estimate_query = f"['{table_name}'] | take {10000} | summarize Total=sum(estimate_data_size(*))"
        size_estimate_result = self.query(size_estimate_query)
        size_estimate = size_estimate_result['Total'].values[0]
        print(f"size_estimate: {size_estimate}")

        # Guard against an empty table / missing estimate (avoids ZeroDivisionError).
        if size_estimate and size_estimate > 0:
            # int(): row_number() bounds in KQL must be integers, not floats.
            chunk_size = int(min(64 * 1024 * 1024 / size_estimate * 0.9 * 10000, 5000000))
        else:
            chunk_size = 100000
        print(f"estimated_chunk_size: {chunk_size}")

        while total_rows_ingested < size:
            try:
                query = f"['{table_name}'] | serialize | extend rn=row_number() | where rn >= {total_rows_ingested} and rn < {total_rows_ingested + chunk_size} | project-away rn"
                chunk_df = self.query(query)
            except Exception:
                # Likely a response-size limit; retry this chunk smaller.
                chunk_size = int(chunk_size * 0.8)
                if chunk_size < 1:
                    # The query fails even for a single row: surface the error
                    # instead of looping forever.
                    raise
                continue

            print(f"total_rows_ingested: {total_rows_ingested}")
            print(chunk_df.head())

            # Stop if no more data
            if chunk_df.empty:
                break

            # For first chunk, create new table; for subsequent chunks, append
            if first_chunk:
                self.ingest_df_to_duckdb(chunk_df, name_as)
                first_chunk = False
            else:
                # Append to existing table via a throwaway registered view.
                random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
                self.duck_db_conn.register(f'df_temp_{random_suffix}', chunk_df)
                self.duck_db_conn.execute(f"INSERT INTO {name_as} SELECT * FROM df_temp_{random_suffix}")
                self.duck_db_conn.execute(f"DROP VIEW df_temp_{random_suffix}")

            total_rows_ingested += len(chunk_df)

    def ingest_data_from_query(self, query: str, name_as: str):
        """Run a KQL *query* and store its full result in DuckDB as *name_as*."""
        # Sanitize the table name for SQL compatibility
        name_as = sanitize_table_name(name_as)
        df = self.query(query)
        self.ingest_df_to_duckdb(df, name_as)
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import json
2+
3+
import pandas as pd
4+
import duckdb
5+
6+
from data_formulator.data_loader.external_data_loader import ExternalDataLoader
7+
from typing import Dict, Any
8+
9+
class MySQLDataLoader(ExternalDataLoader):
    """Expose a MySQL server's tables to DuckDB via the mysql extension."""

    @staticmethod
    def list_params() -> List[Dict[str, Any]]:
        """Connection parameters accepted by this loader."""
        params_list = [
            {"name": "user", "type": "string", "required": True, "default": "root"},
            {"name": "password", "type": "string", "required": False, "default": ""},
            {"name": "host", "type": "string", "required": True, "default": "localhost"},
            {"name": "database", "type": "string", "required": True, "default": "mysql"}
        ]
        return params_list

    def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
        """Attach the MySQL database described by *params* to DuckDB as 'mysqldb'."""
        self.params = params
        self.duck_db_conn = duck_db_conn

        # Install and load the MySQL extension
        self.duck_db_conn.install_extension("mysql")
        self.duck_db_conn.load_extension("mysql")

        # Build the space-separated DuckDB attach string, e.g.
        # "user=root host=localhost database=mysql"; empty values are omitted.
        # NOTE(review): values containing spaces or quotes would break this
        # format — confirm whether quoting is needed for target deployments.
        attach_string = ""
        for key, value in self.params.items():
            if value:
                attach_string += f"{key}={value} "

        # Register MySQL connection
        self.duck_db_conn.execute(f"ATTACH '{attach_string}' AS mysqldb (TYPE mysql);")

    def list_tables(self):
        """Enumerate user tables with columns, row counts, and 10 sample rows.

        MySQL system schemas (information_schema, mysql, performance_schema,
        sys) are excluded.
        """
        tables_df = self.duck_db_conn.execute(f"""
            SELECT TABLE_SCHEMA, TABLE_NAME FROM mysqldb.information_schema.tables 
            WHERE table_schema NOT IN ('information_schema', 'mysql', 'performance_schema', 'sys')
        """).fetch_df()

        results = []

        for schema, table_name in tables_df.values:

            full_table_name = f"{schema}.{table_name}"

            # Get column information using DuckDB's information schema
            columns_df = self.duck_db_conn.execute(f"DESCRIBE mysqldb.{full_table_name}").df()
            columns = [{
                'name': row['column_name'],
                'type': row['column_type']
            } for _, row in columns_df.iterrows()]

            # Get sample data
            sample_df = self.duck_db_conn.execute(f"SELECT * FROM mysqldb.{full_table_name} LIMIT 10").df()
            sample_rows = json.loads(sample_df.to_json(orient="records"))

            # get row count
            row_count = self.duck_db_conn.execute(f"SELECT COUNT(*) FROM mysqldb.{full_table_name}").fetchone()[0]

            table_metadata = {
                "row_count": row_count,
                "columns": columns,
                "sample_rows": sample_rows
            }

            results.append({
                "name": full_table_name,
                "metadata": table_metadata
            })

        return results

    def ingest_data(self, table_name: str, name_as: str = None, size: int = 1000000):
        """Copy up to *size* rows of the MySQL table into the local DuckDB database.

        *table_name* is expected as ``schema.table``; when *name_as* is not
        given, the bare table name (without schema) is used locally.
        """
        if name_as is None:
            name_as = table_name.split('.')[-1]

        self.duck_db_conn.execute(f"""
            CREATE OR REPLACE TABLE {name_as} AS 
            SELECT * FROM mysqldb.{table_name}
            LIMIT {size}
        """)

    def ingest_data_from_query(self, query: str, name_as: str):
        """Run a SQL *query* against the attached MySQL database and store the
        result in the main DuckDB database as *name_as*."""
        self.duck_db_conn.execute(f"""
            CREATE OR REPLACE TABLE main.{name_as} AS 
            SELECT * FROM ({query})
        """)

py-src/data_formulator/db_manager.py

Lines changed: 1 addition & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,9 @@
99
from dotenv import load_dotenv
1010

1111
class DuckDBManager:
12-
def __init__(self, external_db_connections: Dict[str, Dict[str, Any]], local_db_dir: str):
12+
def __init__(self, local_db_dir: str):
1313
# Store session db file paths
1414
self._db_files: Dict[str, str] = {}
15-
16-
# External db connections and tracking of installed extensions
17-
self._external_db_connections: Dict[str, Dict[str, Any]] = external_db_connections
18-
self._installed_extensions: Dict[str, List[str]] = {}
1915
self._local_db_dir: str = local_db_dir
2016

2117
@contextmanager
@@ -26,7 +22,6 @@ def connection(self, session_id: str) -> ContextManager[duckdb.DuckDBPyConnectio
2622
conn = self.get_connection(session_id)
2723
yield conn
2824
finally:
29-
# Close the connection after use
3025
if conn:
3126
conn.close()
3227

@@ -40,52 +35,18 @@ def get_connection(self, session_id: str) -> duckdb.DuckDBPyConnection:
4035
db_file = os.path.join(db_dir, f"df_{session_id}.duckdb")
4136
print(f"=== Creating new db file: {db_file}")
4237
self._db_files[session_id] = db_file
43-
# Initialize extension tracking for this file
44-
self._installed_extensions[db_file] = []
4538
else:
4639
print(f"=== Using existing db file: {self._db_files[session_id]}")
4740
db_file = self._db_files[session_id]
4841

4942
# Create a fresh connection to the database file
5043
conn = duckdb.connect(database=db_file)
5144

52-
if self._external_db_connections and self._external_db_connections['db_type'] in ['mysql', 'postgresql']:
53-
db_name = self._external_db_connections['db_name']
54-
db_type = self._external_db_connections['db_type']
55-
56-
print(f"=== connecting to {db_type} extension")
57-
# Only install if not already installed for this db file
58-
if db_type not in self._installed_extensions.get(db_file, []):
59-
conn.execute(f"INSTALL {db_type};")
60-
self._installed_extensions[db_file].append(db_type)
61-
62-
conn.execute(f"LOAD {db_type};")
63-
conn.execute(f"""CREATE SECRET (
64-
TYPE {db_type},
65-
HOST '{self._external_db_connections['host']}',
66-
PORT '{self._external_db_connections['port']}',
67-
DATABASE '{self._external_db_connections['database']}',
68-
USER '{self._external_db_connections['user']}',
69-
PASSWORD '{self._external_db_connections['password']}');
70-
""")
71-
conn.execute(f"ATTACH '' AS {db_name} (TYPE {db_type});")
72-
# result = conn.execute(f"SELECT * FROM {db_name}.information_schema.tables WHERE table_schema NOT IN ('information_schema', 'mysql', 'performance_schema', 'sys');").fetch_df()
73-
# print(f"=== result: {result}")
74-
7545
return conn
7646

7747
env = load_dotenv()
7848

7949
# Initialize the DB manager
8050
db_manager = DuckDBManager(
81-
external_db_connections={
82-
"db_name": os.getenv('DB_NAME'),
83-
"db_type": os.getenv('DB_TYPE'),
84-
"host": os.getenv('DB_HOST'),
85-
"port": os.getenv('DB_PORT'),
86-
"database": os.getenv('DB_DATABASE'),
87-
"user": os.getenv('DB_USER'),
88-
"password": os.getenv('DB_PASSWORD')
89-
} if os.getenv('USE_EXTERNAL_DB') == 'true' else None,
9051
local_db_dir=os.getenv('LOCAL_DB_DIR')
9152
)

0 commit comments

Comments
 (0)