
Commit 0d3e6c0

[deploy] Merge pull request #155 from microsoft/dev

External data loader class

2 parents 6433640 + 16c2ea9, commit 0d3e6c0

27 files changed: +1602 additions, −504 deletions

.env.template

Lines changed: 1 addition & 13 deletions

```diff
@@ -5,16 +5,4 @@
 DISABLE_DISPLAY_KEYS=false # if true, the display keys will not be shown in the frontend
 EXEC_PYTHON_IN_SUBPROCESS=false # if true, the python code will be executed in a subprocess to avoid crashing the main app, but it will increase the time of response
 
-LOCAL_DB_DIR= # the directory to store the local database, if not provided, the app will use the temp directory
-
-# External database connection settings
-# check https://duckdb.org/docs/stable/extensions/mysql.html
-# and https://duckdb.org/docs/stable/extensions/postgres.html
-USE_EXTERNAL_DB=false # if true, the app will use an external database instead of the one in the app
-DB_NAME=mysql_db # the name to refer to this database connection
-DB_TYPE=mysql # mysql or postgresql
-DB_HOST=localhost
-DB_PORT=0
-DB_DATABASE=mysql
-DB_USER=root
-DB_PASSWORD=
+LOCAL_DB_DIR= # the directory to store the local database, if not provided, the app will use the temp directory
```
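The fallback described in the `LOCAL_DB_DIR` comment can be sketched as follows (an illustrative sketch only; `resolve_local_db_dir` is a hypothetical name, not a function in the app):

```python
import os
import tempfile

def resolve_local_db_dir() -> str:
    # Sketch of the documented LOCAL_DB_DIR behavior: if the variable is
    # unset or empty, fall back to the system temp directory.
    value = os.environ.get("LOCAL_DB_DIR", "").strip()
    return value if value else tempfile.gettempdir()
```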

README.md

Lines changed: 9 additions & 0 deletions

```diff
@@ -8,6 +8,7 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 
 [![YouTube](https://img.shields.io/badge/YouTube-white?logo=youtube&logoColor=%23FF0000)](https://youtu.be/3ndlwt0Wi3c) 
 [![build](https://github.com/microsoft/data-formulator/actions/workflows/python-build.yml/badge.svg)](https://github.com/microsoft/data-formulator/actions/workflows/python-build.yml)
+[![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb)
 
 </div>
 
@@ -22,6 +23,14 @@ Transform data and create rich visualizations iteratively with AI 🪄. Try Data
 
 ## News 🔥🔥🔥
 
+- [05-13-2025] Data Formulator 0.2.1: External Data Loader
+  - We introduced an external data loader class to make importing data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155)
+  - Example data loaders for MySQL and Azure Data Explorer (Kusto) are provided.
+  - Call to action [link](https://github.com/microsoft/data-formulator/issues/156):
+    - Users: let us know which data source you'd like to load data from.
+    - Developers: let's build more data loaders.
+  - Discord channel for discussions: join us! [![Discord](https://img.shields.io/badge/discord-chat-green?logo=discord)](https://discord.gg/mYCZMQKYZb)
+
 - [04-23-2025] Data Formulator 0.2: working with *large* data 📦📦📦
   - Explore large data by:
     1. Upload large data file to the local database (powered by [DuckDB](https://github.com/duckdb/duckdb)).
```

package.json

Lines changed: 3 additions & 3 deletions

```diff
@@ -4,11 +4,11 @@
   "version": "0.1.0",
   "private": true,
   "dependencies": {
-    "@emotion/react": "^11.9.0",
-    "@emotion/styled": "^11.8.1",
+    "@emotion/react": "^11.14.0",
+    "@emotion/styled": "^11.14.0",
     "@fontsource/roboto": "^4.5.5",
     "@mui/icons-material": "^5.14.0",
-    "@mui/material": "^5.6.0",
+    "@mui/material": "^7.0.2",
     "@reduxjs/toolkit": "^1.8.6",
     "@types/dompurify": "^3.0.5",
     "@types/validator": "^13.12.2",
```

py-src/data_formulator/agent_routes.py

Lines changed: 23 additions & 2 deletions

```diff
@@ -29,7 +29,7 @@
 from data_formulator.agents.agent_data_load import DataLoadAgent
 from data_formulator.agents.agent_data_clean import DataCleanAgent
 from data_formulator.agents.agent_code_explanation import CodeExplanationAgent
-
+from data_formulator.agents.agent_query_completion import QueryCompletionAgent
 from data_formulator.agents.client_utils import Client
 
 from data_formulator.db_manager import db_manager
@@ -437,4 +437,25 @@ def request_code_expl():
         expl = code_expl_agent.run(input_tables, code)
     else:
         expl = ""
-    return expl
+    return expl
+
+@agent_bp.route('/query-completion', methods=['POST'])
+def query_completion():
+    if request.is_json:
+        logger.info("# request data: ")
+        content = request.get_json()
+
+        client = get_client(content['model'])
+
+        data_source_metadata = content["data_source_metadata"]
+        query = content["query"]
+
+
+        query_completion_agent = QueryCompletionAgent(client=client)
+        reasoning, query = query_completion_agent.run(data_source_metadata, query)
+        response = flask.jsonify({ "token": "", "status": "ok", "reasoning": reasoning, "query": query })
+    else:
+        response = flask.jsonify({ "token": "", "status": "error", "reasoning": "unable to complete query", "query": "" })
+
+    response.headers.add('Access-Control-Allow-Origin', '*')
+    return response
```
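A client of the new endpoint sends a JSON body with the three fields the route reads (`model`, `data_source_metadata`, `query`) and receives a `{token, status, reasoning, query}` object back. A hedged sketch of the payload shape (the `model` value is a placeholder; the real value is whatever `get_client` expects):

```python
import json

# Hypothetical request body for POST /query-completion; field names match
# the keys the route reads above.
payload = {
    "model": "placeholder-model",
    "data_source_metadata": {
        "data_source": "sales_db",
        "tables": {"orders": ["id", "total", "created_at"]},
    },
    "query": "total sales per month",
}
body = json.dumps(payload)

# On success the route responds with exactly these four fields:
ok_response_keys = {"token", "status", "reasoning", "query"}
```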

py-src/data_formulator/agents/agent_py_data_rec.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -165,7 +165,7 @@ def process_gpt_response(self, input_tables, messages, response):
         if result['status'] == 'ok':
             result_df = result['content']
             result['content'] = {
-                'rows': result_df.to_dict(orient='records'),
+                'rows': json.loads(result_df.to_json(orient='records')),
            }
         else:
             logger.info(result['content'])
```

py-src/data_formulator/agents/agent_py_data_transform.py

Lines changed: 1 addition & 3 deletions

```diff
@@ -221,13 +221,11 @@ def process_gpt_response(self, input_tables, messages, response):
         result = py_sandbox.run_transform_in_sandbox2020(code_str, [pd.DataFrame.from_records(t['rows']) for t in input_tables], self.exec_python_in_subprocess)
         result['code'] = code_str
 
-        print(f"result: {result}")
-
         if result['status'] == 'ok':
             # parse the content
             result_df = result['content']
             result['content'] = {
-                'rows': result_df.to_dict(orient='records'),
+                'rows': json.loads(result_df.to_json(orient='records')),
             }
         else:
             logger.info(result['content'])
```
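The switch from `to_dict` to a `to_json` round-trip (here and in `agent_py_data_rec.py`) matters because `to_dict(orient='records')` keeps `NaN` and pandas `Timestamp` objects that the standard JSON encoder cannot serialize, while `to_json` emits `null` and epoch timestamps. A small demonstration, assuming pandas' default `to_json` settings:

```python
import json
import math

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "x": [1.0, np.nan],
    "t": pd.to_datetime(["2025-05-13", "2025-05-14"]),
})

# to_dict keeps NaN and pandas Timestamp objects as-is
rows_dict = df.to_dict(orient="records")

# to_json + json.loads yields JSON-safe values: NaN -> None,
# timestamps -> epoch milliseconds (pandas' default date_format)
rows_json = json.loads(df.to_json(orient="records"))
```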
py-src/data_formulator/agents/agent_query_completion.py

Lines changed: 80 additions & 0 deletions

````python
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import pandas as pd
import json

from data_formulator.agents.agent_utils import extract_code_from_gpt_response, extract_json_objects
import re
import logging


logger = logging.getLogger(__name__)


SYSTEM_PROMPT = '''You are a data scientist to help with data queries.
The user will provide you with a description of the data source and tables available in the [DATA SOURCE] section and a query in the [USER INPUTS] section.
You will need to help the user complete the query and provide reasoning for the query you generated in the [OUTPUT] section.

Input format:
* The data source description is a json object with the following fields:
    * `data_source`: the name of the data source
    * `tables`: a list of tables in the data source, which maps the table name to the list of columns available in the table.
* The user input is a natural language description of the query or a partial query you need to complete.

Steps:
* Based on data source description and user input, you should first decide on what language should be used to query the data.
* Then, describe the logic for the query you generated in a json object in a block ```json``` with the following fields:
    * `language`: the language of the query you generated
    * `tables`: the names of the tables you will use in the query
    * `logic`: the reasoning behind why you chose the tables and the logic for the query you generated
* Finally, generate the complete query in the language specified in a code block ```{language}```.

Output format:
* The output should be in the following format, no other text should be included:

[REASONING]
```json
{
    "language": {language},
    "tables": {tables},
    "logic": {logic}
}
```

[QUERY]
```{language}
{query}
```
'''

class QueryCompletionAgent(object):

    def __init__(self, client):
        self.client = client

    def run(self, data_source_metadata, query):

        user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n[REASONING]\n"

        logger.info(user_query)

        messages = [{"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": user_query}]

        ###### the part that calls open_ai
        response = self.client.get_completion(messages=messages)
        response_content = '[REASONING]\n' + response.choices[0].message.content

        logger.info(f"=== query completion output ===>\n{response_content}\n")

        reasoning = extract_json_objects(response_content.split("[REASONING]")[1].split("[QUERY]")[0].strip())[0]
        output_query = response_content.split("[QUERY]")[1].strip()

        # Extract the query by removing the language markers
        language_pattern = r"```(\w+)\s+(.*?)```"
        match = re.search(language_pattern, output_query, re.DOTALL)
        if match:
            output_query = match.group(2).strip()

        return reasoning, output_query
````

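The query-extraction step at the end of `run` can be checked in isolation: the model's [QUERY] section wraps the query in a fenced block with a language tag, and the regex strips the fence. A stand-alone example of that extraction:

```python
import re

# Stand-alone check of the fence-stripping regex used in QueryCompletionAgent.run
response_tail = "```sql\nSELECT month, SUM(total) AS total FROM orders GROUP BY month\n```"

language_pattern = r"```(\w+)\s+(.*?)```"
match = re.search(language_pattern, response_tail, re.DOTALL)
language = match.group(1)
output_query = match.group(2).strip()
```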
py-src/data_formulator/app.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -37,6 +37,7 @@
 from data_formulator.tables_routes import tables_bp
 from data_formulator.agent_routes import agent_bp
 
+
 app = Flask(__name__, static_url_path='', static_folder=os.path.join(APP_ROOT, "dist"))
 app.secret_key = secrets.token_hex(16) # Generate a random secret key for sessions
 
```
py-src/data_formulator/data_loader/README.md

Lines changed: 36 additions & 0 deletions

## Data Loader Module

This module provides a framework for loading data from various external sources into DuckDB. It follows an abstract base class pattern to ensure consistent implementation across different data sources.

### Building a New Data Loader

The abstract class `ExternalDataLoader` defines the data loader interface. Each concrete implementation (e.g., `KustoDataLoader`, `MySQLDataLoader`) handles specific data source connections and data ingestion.

To create a new data loader:

1. Create a new class that inherits from `ExternalDataLoader`.
2. Implement the required abstract methods:
    - `list_params()`: Define required connection parameters
    - `__init__()`: Initialize connection to data source
    - `list_tables()`: List available tables/views
    - `ingest_data()`: Load data from source
    - `view_query_sample()`: Preview query results
    - `ingest_data_from_query()`: Load data from custom query
3. Register the new class in `__init__.py` so that the front-end can automatically discover the new data loader.

The UI automatically provides a query completion option to help users generate queries for the given data loader (from natural language or partial queries).

### Example Implementations

- `KustoDataLoader`: Azure Data Explorer (Kusto) integration
- `MySQLDataLoader`: MySQL database integration

### Testing

Ensure your implementation:
- Handles connection errors gracefully
- Properly sanitizes table names
- Respects size limits for data ingestion
- Returns a consistent metadata format

Launch the front-end and test the data loader.
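The steps above can be sketched as follows. This is an illustrative toy, not the real interface: the method names come from the list above, but the actual signatures live in `ExternalDataLoader`, so a minimal stand-in base class is defined here to keep the sketch self-contained.

```python
from abc import ABC, abstractmethod

class ExternalDataLoaderSketch(ABC):
    """Stand-in for ExternalDataLoader; method names follow the README list,
    signatures are illustrative assumptions."""

    @staticmethod
    @abstractmethod
    def list_params(): ...

    @abstractmethod
    def list_tables(self): ...

    @abstractmethod
    def ingest_data(self, table_name): ...

    @abstractmethod
    def view_query_sample(self, query): ...

    @abstractmethod
    def ingest_data_from_query(self, query, table_name): ...

class InMemoryDataLoader(ExternalDataLoaderSketch):
    """Toy loader over a dict of row lists, standing in for a real source."""

    @staticmethod
    def list_params():
        # Define required connection parameters
        return [{"name": "source", "type": "string", "required": True}]

    def __init__(self, tables):
        # Initialize "connection": here just a {table_name: [row_dict, ...]} map
        self.tables = tables

    def list_tables(self):
        # List available tables with their column names
        return [{"name": name, "columns": sorted({k for row in rows for k in row})}
                for name, rows in self.tables.items()]

    def ingest_data(self, table_name):
        # Load all rows from the source table
        return self.tables[table_name]

    def view_query_sample(self, query):
        # Preview results; the toy "query" is just a table name
        return self.ingest_data(query)[:10]

    def ingest_data_from_query(self, query, table_name):
        # Materialize query results under a new table name
        self.tables[table_name] = self.ingest_data(query)
        return self.tables[table_name]
```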
py-src/data_formulator/data_loader/__init__.py

Lines changed: 10 additions & 0 deletions

```python
from data_formulator.data_loader.external_data_loader import ExternalDataLoader
from data_formulator.data_loader.mysql_data_loader import MySQLDataLoader
from data_formulator.data_loader.kusto_data_loader import KustoDataLoader

DATA_LOADERS = {
    "mysql": MySQLDataLoader,
    "kusto": KustoDataLoader
}

__all__ = ["ExternalDataLoader", "MySQLDataLoader", "KustoDataLoader", "DATA_LOADERS"]
```
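The `DATA_LOADERS` registry maps a data-source type string to its loader class, which lets route code instantiate loaders generically. A hedged sketch of such a lookup (`make_loader` is hypothetical, not a function in the codebase):

```python
def make_loader(source_type, params, registry):
    # Hypothetical helper: look up the loader class registered for this
    # data-source type and instantiate it with the given parameters.
    loader_cls = registry.get(source_type)
    if loader_cls is None:
        raise ValueError(f"unknown data source type: {source_type}")
    return loader_cls(**params)
```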
