Skip to content

Commit 8973435

Browse files
authored
[deploy] Merge pull request #171 from microsoft/dev
[deploy] 0.2.4 - Dev: performance update & data loader improvement
2 parents 612dc33 + b895f39 commit 8973435

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+2514
-1567
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -26,9 +26,10 @@ Any questions? Ask on the Discord channel! [![Discord](https://img.shields.io/ba
2626

2727
## News 🔥🔥🔥
2828

29-
- [05-13-2025] Data Formulator 0.2.3: External Data Loader
29+
- [05-13-2025] Data Formulator 0.2.3 / 0.2.4: External Data Loader
3030
- We introduced external data loader class to make import data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155)
3131
- Current data loaders: MySQL, Azure Data Explorer (Kusto), Azure Blob and Amazon S3 (json, parquet, csv).
32+
- [07-01-2025] Updated with: Postgresql, mssql.
3233
- Call for action [link](https://github.com/microsoft/data-formulator/issues/156):
3334
- Users: let us know which data source you'd like to load data from.
3435
- Developers: let's build more data loaders.

local_server.bat

Lines changed: 1 addition & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -6,7 +6,5 @@
66
:: set http_proxy=http://127.0.0.1:7890
77
:: set https_proxy=http://127.0.0.1:7890
88

9-
set FLASK_APP=py-src/data_formulator/app.py
109
set FLASK_RUN_PORT=5000
11-
set FLASK_RUN_HOST=0.0.0.0
12-
flask run
10+
python -m py-src.data_formulator.app --port %FLASK_RUN_PORT% --dev

local_server.sh

Lines changed: 3 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -5,4 +5,6 @@
55
# export http_proxy=http://127.0.0.1:7890
66
# export https_proxy=http://127.0.0.1:7890
77

8-
env FLASK_APP=py-src/data_formulator/app.py FLASK_RUN_PORT=5000 FLASK_RUN_HOST=0.0.0.0 flask run
8+
#env FLASK_APP=py-src/data_formulator/app.py FLASK_RUN_PORT=5000 FLASK_RUN_HOST=0.0.0.0 flask run
9+
export FLASK_RUN_PORT=5000
10+
python -m py-src.data_formulator.app --port ${FLASK_RUN_PORT} --dev

package.json

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -7,8 +7,8 @@
77
"@emotion/react": "^11.14.0",
88
"@emotion/styled": "^11.14.0",
99
"@fontsource/roboto": "^4.5.5",
10-
"@mui/icons-material": "^5.14.0",
11-
"@mui/material": "^7.0.2",
10+
"@mui/icons-material": "^7.1.1",
11+
"@mui/material": "^7.1.1",
1212
"@reduxjs/toolkit": "^1.8.6",
1313
"@types/dompurify": "^3.0.5",
1414
"@types/validator": "^13.12.2",

py-src/data_formulator/agent_routes.py

Lines changed: 0 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -425,8 +425,6 @@ def request_code_expl():
425425
if request.is_json:
426426
logger.info("# request data: ")
427427
content = request.get_json()
428-
token = content["token"]
429-
430428
client = get_client(content['model'])
431429

432430
# each table is a dict with {"name": xxx, "rows": [...]}

py-src/data_formulator/app.py

Lines changed: 14 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -29,14 +29,17 @@
2929
from dotenv import load_dotenv
3030
import secrets
3131
import base64
32-
APP_ROOT = Path(os.path.join(Path(__file__).parent)).absolute()
32+
APP_ROOT = Path(Path(__file__).parent).absolute()
3333

3434
import os
3535

3636
# blueprints
3737
from data_formulator.tables_routes import tables_bp
3838
from data_formulator.agent_routes import agent_bp
39+
from data_formulator.sse_routes import sse_bp
3940

41+
import queue
42+
from typing import Dict, Any
4043

4144
app = Flask(__name__, static_url_path='', static_folder=os.path.join(APP_ROOT, "dist"))
4245
app.secret_key = secrets.token_hex(16) # Generate a random secret key for sessions
@@ -65,6 +68,7 @@ def default(self, obj):
6568
# register blueprints
6669
app.register_blueprint(tables_bp)
6770
app.register_blueprint(agent_bp)
71+
app.register_blueprint(sse_bp)
6872

6973
print(APP_ROOT)
7074

@@ -252,6 +256,8 @@ def parse_args() -> argparse.Namespace:
252256
help="Whether to execute python in subprocess, it makes the app more secure (reducing the chance for the model to access the local machine), but increases the time of response")
253257
parser.add_argument("-d", "--disable-display-keys", action='store_true', default=False,
254258
help="Whether disable displaying keys in the frontend UI, recommended to turn on if you host the app not just for yourself.")
259+
parser.add_argument("--dev", action='store_true', default=False,
260+
help="Launch the app in development mode (prevents the app from opening the browser automatically)")
255261
return parser.parse_args()
256262

257263

@@ -264,11 +270,14 @@ def run_app():
264270
'disable_display_keys': args.disable_display_keys
265271
}
266272

267-
url = "http://localhost:{0}".format(args.port)
268-
threading.Timer(2, lambda: webbrowser.open(url, new=2)).start()
273+
if not args.dev:
274+
url = "http://localhost:{0}".format(args.port)
275+
threading.Timer(2, lambda: webbrowser.open(url, new=2)).start()
276+
277+
# Enable debug mode and auto-reload in development mode
278+
debug_mode = args.dev
279+
app.run(host='0.0.0.0', port=args.port, threaded=True, debug=debug_mode, use_reloader=debug_mode)
269280

270-
app.run(host='0.0.0.0', port=args.port, threaded=True)
271-
272281
if __name__ == '__main__':
273282
#app.run(debug=True, host='127.0.0.1', port=5000)
274283
#use 0.0.0.0 for public
Lines changed: 6 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1,14 +1,19 @@
11
from data_formulator.data_loader.external_data_loader import ExternalDataLoader
22
from data_formulator.data_loader.mysql_data_loader import MySQLDataLoader
3+
from data_formulator.data_loader.mssql_data_loader import MSSQLDataLoader
34
from data_formulator.data_loader.kusto_data_loader import KustoDataLoader
45
from data_formulator.data_loader.s3_data_loader import S3DataLoader
56
from data_formulator.data_loader.azure_blob_data_loader import AzureBlobDataLoader
7+
from data_formulator.data_loader.postgresql_data_loader import PostgreSQLDataLoader
68

79
DATA_LOADERS = {
810
"mysql": MySQLDataLoader,
11+
"mssql": MSSQLDataLoader,
912
"kusto": KustoDataLoader,
1013
"s3": S3DataLoader,
1114
"azure_blob": AzureBlobDataLoader,
15+
"postgresql": PostgreSQLDataLoader
1216
}
1317

14-
__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "S3DataLoader", "AzureBlobDataLoader", "DATA_LOADERS"]
18+
__all__ = ["ExternalDataLoader", "MySQLDataLoader", "MSSQLDataLoader", "KustoDataLoader", "S3DataLoader", "AzureBlobDataLoader","PostgreSQLDataLoader","DATA_LOADERS"]
19+

py-src/data_formulator/data_loader/azure_blob_data_loader.py

Lines changed: 6 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -116,7 +116,7 @@ def _setup_azure_authentication(self):
116116
)
117117
""")
118118

119-
def list_tables(self) -> List[Dict[str, Any]]:
119+
def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]:
120120
# Use Azure SDK to list blobs in the container
121121
from azure.storage.blob import BlobServiceClient
122122

@@ -145,8 +145,7 @@ def list_tables(self) -> List[Dict[str, Any]]:
145145
container_client = blob_service_client.get_container_client(self.container_name)
146146

147147
# List blobs in the container
148-
blob_list = container_client.list_blobs()
149-
148+
blob_list = container_client.list_blobs()
150149
results = []
151150

152151
for blob in blob_list:
@@ -156,6 +155,10 @@ def list_tables(self) -> List[Dict[str, Any]]:
156155
if blob_name.endswith('/') or not self._is_supported_file(blob_name):
157156
continue
158157

158+
# Apply table filter if provided
159+
if table_filter and table_filter.lower() not in blob_name.lower():
160+
continue
161+
159162
# Create Azure blob URL
160163
azure_url = f"az://{self.account_name}.{self.endpoint}/{self.container_name}/{blob_name}"
161164

py-src/data_formulator/data_loader/external_data_loader.py

Lines changed: 25 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -44,6 +44,18 @@ def sanitize_table_name(name_as: str) -> str:
4444
class ExternalDataLoader(ABC):
4545

4646
def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str):
47+
# Log DataFrame info before ingestion
48+
import logging
49+
logger = logging.getLogger(__name__)
50+
logger.info(f"Ingesting DataFrame to DuckDB table '{table_name}'")
51+
logger.info(f"DataFrame shape: {df.shape}")
52+
logger.info(f"DataFrame dtypes: {dict(df.dtypes)}")
53+
54+
# Log sample of datetime columns
55+
for col in df.columns:
56+
if pd.api.types.is_datetime64_any_dtype(df[col]):
57+
sample_values = df[col].dropna().head(3)
58+
logger.info(f"Datetime column '{col}' sample values: {list(sample_values)}")
4759

4860
base_name = table_name
4961
counter = 1
@@ -59,8 +71,19 @@ def ingest_df_to_duckdb(self, df: pd.DataFrame, table_name: str):
5971
# Create table
6072
random_suffix = ''.join(random.choices(string.ascii_letters + string.digits, k=6))
6173
self.duck_db_conn.register(f'df_temp_{random_suffix}', df)
74+
75+
# Log table schema after registration
76+
try:
77+
schema_info = self.duck_db_conn.execute(f"DESCRIBE df_temp_{random_suffix}").fetchall()
78+
logger.info(f"DuckDB table schema: {schema_info}")
79+
except Exception as e:
80+
logger.warning(f"Could not get schema info: {e}")
81+
6282
self.duck_db_conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df_temp_{random_suffix}")
6383
self.duck_db_conn.execute(f"DROP VIEW df_temp_{random_suffix}") # Drop the temporary view after creating the table
84+
85+
logger.info(f"Successfully created DuckDB table '{table_name}'")
86+
6487

6588
@staticmethod
6689
@abstractmethod
@@ -69,15 +92,14 @@ def list_params() -> List[Dict[str, Any]]:
6992

7093
@staticmethod
7194
@abstractmethod
72-
def auth_instructions() -> str:
73-
pass
95+
def auth_instructions() -> str: pass
7496

7597
@abstractmethod
7698
def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnection):
7799
pass
78100

79101
@abstractmethod
80-
def list_tables(self) -> List[Dict[str, Any]]:
102+
def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]:
81103
# should include: table_name, column_names, column_types, sample_data
82104
pass
83105

py-src/data_formulator/data_loader/kusto_data_loader.py

Lines changed: 93 additions & 7 deletions
Original file line number · Diff line number · Diff line change
@@ -1,15 +1,27 @@
1+
import logging
2+
import sys
13
from typing import Dict, Any, List
24
import pandas as pd
35
import json
46
import duckdb
57
import random
68
import string
9+
from datetime import datetime
710

811
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
912
from azure.kusto.data.helpers import dataframe_from_result_table
1013

1114
from data_formulator.data_loader.external_data_loader import ExternalDataLoader, sanitize_table_name
1215

16+
# Configure root logger for general application logging
17+
logging.basicConfig(
18+
level=logging.INFO,
19+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
20+
handlers=[logging.StreamHandler(sys.stdout)]
21+
)
22+
23+
# Get logger for this module
24+
logger = logging.getLogger(__name__)
1325

1426
class KustoDataLoader(ExternalDataLoader):
1527

@@ -67,23 +79,93 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti
6779
self.kusto_cluster, self.client_id, self.client_secret, self.tenant_id))
6880
else:
6981
# This function provides an interface to Kusto. It uses Azure CLI auth, but you can also use other auth types.
70-
self.client = KustoClient(KustoConnectionStringBuilder.with_az_cli_authentication(self.kusto_cluster))
82+
cluster_url = KustoConnectionStringBuilder.with_az_cli_authentication(self.kusto_cluster)
83+
logger.info(f"Connecting to Kusto cluster: {self.kusto_cluster}")
84+
self.client = KustoClient(cluster_url)
85+
logger.info("Using Azure CLI authentication for Kusto client. Ensure you have run `az login` in your terminal.")
7186
except Exception as e:
72-
raise Exception(f"Error creating Kusto client: {e}, please authenticate with Azure CLI when starting the app.")
73-
87+
logger.error(f"Error creating Kusto client: {e}")
88+
raise Exception(f"Error creating Kusto client: {e}, please authenticate with Azure CLI when starting the app.")
7489
self.duck_db_conn = duck_db_conn
7590

91+
def _convert_kusto_datetime_columns(self, df: pd.DataFrame) -> pd.DataFrame:
92+
"""Convert Kusto datetime columns to proper pandas datetime format"""
93+
logger.info(f"Processing DataFrame with columns: {list(df.columns)}")
94+
logger.info(f"Column dtypes before conversion: {dict(df.dtypes)}")
95+
96+
for col in df.columns:
97+
original_dtype = df[col].dtype
98+
99+
if df[col].dtype == 'object':
100+
# Try to identify datetime columns by checking sample values
101+
sample_values = df[col].dropna().head(3)
102+
if len(sample_values) > 0:
103+
# Check if values look like datetime strings or timestamp numbers
104+
first_val = sample_values.iloc[0]
105+
106+
# Handle Kusto datetime format (ISO 8601 strings)
107+
if isinstance(first_val, str) and ('T' in first_val or '-' in first_val):
108+
try:
109+
# Try to parse as datetime
110+
pd.to_datetime(sample_values.iloc[0])
111+
logger.info(f"Converting column '{col}' from string to datetime")
112+
df[col] = pd.to_datetime(df[col], errors='coerce', utc=True).dt.tz_localize(None)
113+
except Exception as e:
114+
logger.debug(f"Failed to convert column '{col}' as string datetime: {e}")
115+
116+
# Handle numeric timestamps (Unix timestamps in various formats)
117+
elif isinstance(first_val, (int, float)) and first_val > 1000000000:
118+
try:
119+
# Try different timestamp formats
120+
if first_val > 1e15: # Likely microseconds since epoch
121+
logger.info(f"Converting column '{col}' from microseconds timestamp to datetime")
122+
df[col] = pd.to_datetime(df[col], unit='us', errors='coerce', utc=True).dt.tz_localize(None)
123+
elif first_val > 1e12: # Likely milliseconds since epoch
124+
logger.info(f"Converting column '{col}' from milliseconds timestamp to datetime")
125+
df[col] = pd.to_datetime(df[col], unit='ms', errors='coerce', utc=True).dt.tz_localize(None)
126+
else: # Likely seconds since epoch
127+
logger.info(f"Converting column '{col}' from seconds timestamp to datetime")
128+
df[col] = pd.to_datetime(df[col], unit='s', errors='coerce', utc=True).dt.tz_localize(None)
129+
except Exception as e:
130+
logger.debug(f"Failed to convert column '{col}' as numeric timestamp: {e}")
131+
132+
# Handle datetime64 columns that might have timezone info
133+
elif pd.api.types.is_datetime64_any_dtype(df[col]):
134+
# Ensure timezone-aware datetimes are properly handled
135+
if hasattr(df[col].dt, 'tz') and df[col].dt.tz is not None:
136+
logger.info(f"Converting timezone-aware datetime column '{col}' to UTC")
137+
df[col] = df[col].dt.tz_convert('UTC').dt.tz_localize(None)
138+
139+
# Log if conversion happened
140+
if original_dtype != df[col].dtype:
141+
logger.info(f"Column '{col}' converted from {original_dtype} to {df[col].dtype}")
142+
143+
logger.info(f"Column dtypes after conversion: {dict(df.dtypes)}")
144+
return df
145+
76146
def query(self, kql: str) -> pd.DataFrame:
147+
logger.info(f"Executing KQL query: {kql} on database {self.kusto_database}")
77148
result = self.client.execute(self.kusto_database, kql)
78-
return dataframe_from_result_table(result.primary_results[0])
149+
logger.info(f"Query executed successfully, returning results.")
150+
df = dataframe_from_result_table(result.primary_results[0])
151+
152+
# Convert datetime columns properly
153+
df = self._convert_kusto_datetime_columns(df)
154+
155+
return df
79156

80-
def list_tables(self) -> List[Dict[str, Any]]:
157+
def list_tables(self, table_filter: str = None) -> List[Dict[str, Any]]:
81158
query = ".show tables"
82159
tables_df = self.query(query)
83160

84161
tables = []
85162
for table in tables_df.to_dict(orient="records"):
86163
table_name = table['TableName']
164+
165+
# Apply table filter if provided
166+
if table_filter and table_filter.lower() not in table_name.lower():
167+
continue
168+
87169
schema_result = self.query(f".show table ['{table_name}'] schema as json").to_dict(orient="records")
88170
columns = [{
89171
'name': r["Name"],
@@ -94,7 +176,10 @@ def list_tables(self) -> List[Dict[str, Any]]:
94176
row_count = row_count_result[0]["TotalRowCount"]
95177

96178
sample_query = f"['{table_name}'] | take {5}"
97-
sample_result = json.loads(self.query(sample_query).to_json(orient="records"))
179+
sample_df = self.query(sample_query)
180+
181+
# Convert sample data to JSON with proper datetime handling
182+
sample_result = json.loads(sample_df.to_json(orient="records", date_format='iso'))
98183

99184
table_metadata = {
100185
"row_count": row_count,
@@ -159,7 +244,8 @@ def ingest_data(self, table_name: str, name_as: str = None, size: int = 5000000)
159244
total_rows_ingested += len(chunk_df)
160245

161246
def view_query_sample(self, query: str) -> str:
162-
return json.loads(self.query(query).head(10).to_json(orient="records"))
247+
df = self.query(query).head(10)
248+
return json.loads(df.to_json(orient="records", date_format='iso'))
163249

164250
def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
165251
# Sanitize the table name for SQL compatibility

0 commit comments

Comments (0)