Commit 9460b3e

Merge remote-tracking branch 'origin/dev' into dev
2 parents: 257e162 + 2ba8043

7 files changed: +265 −37 lines changed

py-src/data_formulator/data_loader/azure_blob_data_loader.py

Lines changed: 6 additions & 3 deletions
@@ -116,7 +116,7 @@ def _setup_azure_authentication(self):
         )
         """)

-    def list_tables(self) -> List[Dict[str, Any]]:
+    def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]:
         # Use Azure SDK to list blobs in the container
         from azure.storage.blob import BlobServiceClient

@@ -145,8 +145,7 @@ def list_tables(self) -> List[Dict[str, Any]]:
         container_client = blob_service_client.get_container_client(self.container_name)

         # List blobs in the container
-        blob_list = container_client.list_blobs()
-
+        blob_list = container_client.list_blobs()
         results = []

         for blob in blob_list:
@@ -156,6 +155,10 @@ def list_tables(self) -> List[Dict[str, Any]]:
             if blob_name.endswith('/') or not self._is_supported_file(blob_name):
                 continue

+            # Apply table filter if provided
+            if table_filter and table_filter.lower() not in blob_name.lower():
+                continue
+
             # Create Azure blob URL
             azure_url = f"az://{self.account_name}.{self.endpoint}/{self.container_name}/{blob_name}"
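Note that the new table_filter parameter is a case-insensitive substring match on blob names, not a glob or regex. A minimal sketch of just that filter in isolation, with hypothetical blob names:

    # Case-insensitive substring filter, mirroring the check added above.
    blob_names = ["Sales/2024.csv", "sales_archive.parquet", "inventory.json"]  # hypothetical
    table_filter = "sales"

    matching = [
        name for name in blob_names
        if not table_filter or table_filter.lower() in name.lower()
    ]
    print(matching)  # ['Sales/2024.csv', 'sales_archive.parquet']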

py-src/data_formulator/data_loader/external_data_loader.py

Lines changed: 25 additions & 3 deletions
@@ -44,6 +44,18 @@ def sanitize_table_name(name_as: str) -> str:
 class ExternalDataLoader(ABC):

     def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str):
+        # Log DataFrame info before ingestion
+        import logging
+        logger = logging.getLogger(__name__)
+        logger.info(f"Ingesting DataFrame to DuckDB table '{table_name}'")
+        logger.info(f"DataFrame shape: {df.shape}")
+        logger.info(f"DataFrame dtypes: {dict(df.dtypes)}")
+
+        # Log sample of datetime columns
+        for col in df.columns:
+            if pd.api.types.is_datetime64_any_dtype(df[col]):
+                sample_values = df[col].dropna().head(3)
+                logger.info(f"Datetime column '{col}' sample values: {list(sample_values)}")

         base_name = table_name
         counter = 1
@@ -59,8 +71,19 @@ def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str):
         # Create table
         random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
         self.duck_db_conn.register(f'df_temp_{random_suffix}', df)
+
+        # Log table schema after registration
+        try:
+            schema_info = self.duck_db_conn.execute(f"DESCRIBE df_temp_{random_suffix}").fetchall()
+            logger.info(f"DuckDB table schema: {schema_info}")
+        except Exception as e:
+            logger.warning(f"Could not get schema info: {e}")
+
         self.duck_db_conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df_temp_{random_suffix}")
         self.duck_db_conn.execute(f"DROP VIEW df_temp_{random_suffix}")  # Drop the temporary view after creating the table
+
+        logger.info(f"Successfully created DuckDB table '{table_name}'")

     @staticmethod
     @abstractmethod
@@ -69,15 +92,14 @@ def list_params() -> List[Dict[str, Any]]:

     @staticmethod
     @abstractmethod
-    def auth_instructions() -> str:
-        pass
+    def auth_instructions() -> str: pass

     @abstractmethod
     def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
         pass

     @abstractmethod
-    def list_tables(self) -> List[Dict[str, Any]]:
+    def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]:
         # should include: table_name, column_names, column_types, sample_data
         pass
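For context, ingest_df_to_duckdb materializes the DataFrame by registering it as a temporary view and then running CREATE TABLE AS over it; the new logging wraps that flow. A self-contained sketch of the pattern against an in-memory DuckDB connection (table and column names are illustrative):

    import random
    import string

    import duckdb
    import pandas as pd

    conn = duckdb.connect(":memory:")
    df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-01", "2024-01-02"]), "v": [1, 2]})

    suffix = "".join(random.choices(string.ascii_letters + string.digits, k=6))
    conn.register(f"df_temp_{suffix}", df)  # expose the DataFrame as a temporary view
    print(conn.execute(f"DESCRIBE df_temp_{suffix}").fetchall())  # the schema check the new logging records
    conn.execute(f"CREATE TABLE my_table AS SELECT * FROM df_temp_{suffix}")
    conn.execute(f"DROP VIEW df_temp_{suffix}")  # drop the temporary view, as the source does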

py-src/data_formulator/data_loader/kusto_data_loader.py

Lines changed: 93 additions & 7 deletions
@@ -1,15 +1,27 @@
+import logging
+import sys
 from typing import Dict, Any, List
 import pandas as pd
 import json
 import duckdb
 import random
 import string
+from datetime import datetime

 from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
 from azure.kusto.data.helpers import dataframe_from_result_table

 from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name

+# Configure root logger for general application logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[logging.StreamHandler(sys.stdout)]
+)
+
+# Get logger for this module
+logger = logging.getLogger(__name__)

 class KustoDataLoader(ExternalDataLoader):

@@ -67,23 +79,93 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti
                 self.kusto_cluster, self.client_id, self.client_secret, self.tenant_id))
             else:
                 # This function provides an interface to Kusto. It uses Azure CLI auth, but you can also use other auth types.
-                self.client = KustoClient(KustoConnectionStringBuilder.with_az_cli_authentication(self.kusto_cluster))
+                cluster_url = KustoConnectionStringBuilder.with_az_cli_authentication(self.kusto_cluster)
+                logger.info(f"Connecting to Kusto cluster: {self.kusto_cluster}")
+                self.client = KustoClient(cluster_url)
+                logger.info("Using Azure CLI authentication for Kusto client. Ensure you have run `az login` in your terminal.")
         except Exception as e:
-            raise Exception(f"Error creating Kusto client: {e}, please authenticate with Azure CLI when starting the app.")
-
+            logger.error(f"Error creating Kusto client: {e}")
+            raise Exception(f"Error creating Kusto client: {e}, please authenticate with Azure CLI when starting the app.")
         self.duck_db_conn = duck_db_conn

+    def _convert_kusto_datetime_columns(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Convert Kusto datetime columns to proper pandas datetime format"""
+        logger.info(f"Processing DataFrame with columns: {list(df.columns)}")
+        logger.info(f"Column dtypes before conversion: {dict(df.dtypes)}")
+
+        for col in df.columns:
+            original_dtype = df[col].dtype
+
+            if df[col].dtype == 'object':
+                # Try to identify datetime columns by checking sample values
+                sample_values = df[col].dropna().head(3)
+                if len(sample_values) > 0:
+                    # Check if values look like datetime strings or timestamp numbers
+                    first_val = sample_values.iloc[0]
+
+                    # Handle Kusto datetime format (ISO 8601 strings)
+                    if isinstance(first_val, str) and ('T' in first_val or '-' in first_val):
+                        try:
+                            # Try to parse as datetime
+                            pd.to_datetime(sample_values.iloc[0])
+                            logger.info(f"Converting column '{col}' from string to datetime")
+                            df[col] = pd.to_datetime(df[col], errors='coerce', utc=True).dt.tz_localize(None)
+                        except Exception as e:
+                            logger.debug(f"Failed to convert column '{col}' as string datetime: {e}")
+
+                    # Handle numeric timestamps (Unix timestamps in various formats)
+                    elif isinstance(first_val, (int, float)) and first_val > 1000000000:
+                        try:
+                            # Try different timestamp formats
+                            if first_val > 1e15:  # Likely microseconds since epoch
+                                logger.info(f"Converting column '{col}' from microseconds timestamp to datetime")
+                                df[col] = pd.to_datetime(df[col], unit='us', errors='coerce', utc=True).dt.tz_localize(None)
+                            elif first_val > 1e12:  # Likely milliseconds since epoch
+                                logger.info(f"Converting column '{col}' from milliseconds timestamp to datetime")
+                                df[col] = pd.to_datetime(df[col], unit='ms', errors='coerce', utc=True).dt.tz_localize(None)
+                            else:  # Likely seconds since epoch
+                                logger.info(f"Converting column '{col}' from seconds timestamp to datetime")
+                                df[col] = pd.to_datetime(df[col], unit='s', errors='coerce', utc=True).dt.tz_localize(None)
+                        except Exception as e:
+                            logger.debug(f"Failed to convert column '{col}' as numeric timestamp: {e}")
+
+            # Handle datetime64 columns that might have timezone info
+            elif pd.api.types.is_datetime64_any_dtype(df[col]):
+                # Ensure timezone-aware datetimes are properly handled
+                if hasattr(df[col].dt, 'tz') and df[col].dt.tz is not None:
+                    logger.info(f"Converting timezone-aware datetime column '{col}' to UTC")
+                    df[col] = df[col].dt.tz_convert('UTC').dt.tz_localize(None)
+
+            # Log if conversion happened
+            if original_dtype != df[col].dtype:
+                logger.info(f"Column '{col}' converted from {original_dtype} to {df[col].dtype}")
+
+        logger.info(f"Column dtypes after conversion: {dict(df.dtypes)}")
+        return df
+
     def query(self, kql: str) -> pd.DataFrame:
+        logger.info(f"Executing KQL query: {kql} on database {self.kusto_database}")
         result = self.client.execute(self.kusto_database, kql)
-        return dataframe_from_result_table(result.primary_results[0])
+        logger.info(f"Query executed successfully, returning results.")
+        df = dataframe_from_result_table(result.primary_results[0])
+
+        # Convert datetime columns properly
+        df = self._convert_kusto_datetime_columns(df)
+
+        return df

-    def list_tables(self) -> List[Dict[str, Any]]:
+    def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]:
         query = ".show tables"
         tables_df = self.query(query)

         tables = []
         for table in tables_df.to_dict(orient="records"):
             table_name = table['TableName']
+
+            # Apply table filter if provided
+            if table_filter and table_filter.lower() not in table_name.lower():
+                continue
+
             schema_result = self.query(f".show table ['{table_name}'] schema as json").to_dict(orient="records")
             columns = [{
                 'name': r["Name"],
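The magnitude thresholds in _convert_kusto_datetime_columns (above 1e15 read as microseconds, above 1e12 as milliseconds, otherwise seconds) pick the epoch unit before conversion. A toy demonstration of just that heuristic; convert_epoch_column is a hypothetical helper, not part of the codebase:

    import pandas as pd

    def convert_epoch_column(s: pd.Series) -> pd.Series:
        # Pick the epoch unit by magnitude, as the loader's heuristic does.
        first = s.dropna().iloc[0]
        if first > 1e15:
            unit = "us"  # microseconds since epoch
        elif first > 1e12:
            unit = "ms"  # milliseconds since epoch
        else:
            unit = "s"   # seconds since epoch
        return pd.to_datetime(s, unit=unit, errors="coerce", utc=True).dt.tz_localize(None)

    print(convert_epoch_column(pd.Series([1_700_000_000])))      # read as seconds
    print(convert_epoch_column(pd.Series([1_700_000_000_000])))  # read as milliseconds; same instant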
@@ -94,7 +176,10 @@ def list_tables(self) -> List[Dict[str, Any]]:
             row_count = row_count_result[0]["TotalRowCount"]

             sample_query = f"['{table_name}'] | take {5}"
-            sample_result = json.loads(self.query(sample_query).to_json(orient="records"))
+            sample_df = self.query(sample_query)
+
+            # Convert sample data to JSON with proper datetime handling
+            sample_result = json.loads(sample_df.to_json(orient="records", date_format='iso'))

             table_metadata = {
                 "row_count": row_count,
@@ -159,7 +244,8 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000)
             total_rows_ingested += len(chunk_df)

     def view_query_sample(self, query: str) -> str:
-        return json.loads(self.query(query).head(10).to_json(orient="records"))
+        df = self.query(query).head(10)
+        return json.loads(df.to_json(orient="records", date_format='iso'))

     def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
         # Sanitize the table name for SQL compatibility
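The date_format='iso' switch matters because to_json defaults to epoch milliseconds for datetime columns, which JSON consumers would otherwise see as bare integers. A quick comparison on toy data:

    import pandas as pd

    df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-02 03:04:05"])})
    print(df.to_json(orient="records"))                     # default epoch ms, e.g. [{"ts":1704164645000}]
    print(df.to_json(orient="records", date_format="iso"))  # ISO 8601, e.g. [{"ts":"2024-01-02T03:04:05.000"}]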

py-src/data_formulator/data_loader/mysql_data_loader.py

Lines changed: 6 additions & 4 deletions
@@ -61,12 +61,10 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti
         try:
             self.duck_db_conn.execute("DETACH mysqldb;")
         except:
-            pass # Ignore if mysqldb doesn't exist
-
-        # Register MySQL connection
+            pass # Ignore if mysqldb doesn't exist # Register MySQL connection
         self.duck_db_conn.execute(f"ATTACH '{attatch_string}' AS mysqldb (TYPE mysql);")

-    def list_tables(self):
+    def list_tables(self, table_filter: str = None):
         tables_df = self.duck_db_conn.execute(f"""
             SELECT TABLE_SCHEMA, TABLE_NAME FROM mysqldb.information_schema.tables
             WHERE table_schema NOT IN ('information_schema', 'mysql', 'performance_schema', 'sys')
@@ -78,6 +76,10 @@ def list_tables(self):

             full_table_name = f"mysqldb.{schema}.{table_name}"

+            # Apply table filter if provided
+            if table_filter and table_filter.lower() not in table_name.lower():
+                continue
+
             # Get column information using DuckDB's information schema
             columns_df = self.duck_db_conn.execute(f"DESCRIBE {full_table_name}").df()
             columns = [{
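Setting aside the comment that got folded onto the pass line, the underlying pattern here is detach-then-attach, so re-initializing the loader does not fail when mysqldb is already attached. A hedged sketch of that pattern; it assumes DuckDB's mysql extension is available, and the connection string is a placeholder:

    import duckdb

    conn = duckdb.connect(":memory:")
    attach_string = "host=localhost user=root port=3306 database=mydb"  # placeholder credentials

    try:
        conn.execute("DETACH mysqldb;")
    except duckdb.Error:
        pass  # nothing attached yet; safe to ignore
    conn.execute(f"ATTACH '{attach_string}' AS mysqldb (TYPE mysql);")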

py-src/data_formulator/data_loader/s3_data_loader.py

Lines changed: 5 additions & 1 deletion
@@ -78,7 +78,7 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti
         if self.aws_session_token:  # Add this block
             self.duck_db_conn.execute(f"SET s3_session_token='{self.aws_session_token}'")

-    def list_tables(self) -> List[Dict[str, Any]]:
+    def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]:
         # Use boto3 to list objects in the bucket
         import boto3

@@ -103,6 +103,10 @@ def list_tables(self) -> List[Dict[str, Any]]:
             if key.endswith('/') or not self._is_supported_file(key):
                 continue

+            # Apply table filter if provided
+            if table_filter and table_filter.lower() not in key.lower():
+                continue
+
             # Create S3 URL
             s3_url = f"s3://{self.bucket}/{key}"
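A sketch of the same substring filter applied while paging through bucket keys with boto3; the bucket name and filter value are hypothetical, and credentials are resolved through boto3's usual chain:

    import boto3

    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    table_filter = "sales"  # hypothetical

    for page in paginator.paginate(Bucket="my-data-bucket"):  # hypothetical bucket
        for obj in page.get("Contents", []):
            key = obj["Key"]
            if key.endswith("/"):
                continue  # skip folder placeholder keys
            if table_filter and table_filter.lower() not in key.lower():
                continue  # case-insensitive filter, as in the diff
            print(f"s3://my-data-bucket/{key}")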
