microsoft · Chenglong-MS · Jul 11, 2025 · Jul 3, 2025 · Jul 7, 2025 · Jul 9, 2025
diff --git a/README.md b/README.md
@@ -25,8 +25,11 @@ Any questions? Ask on the Discord channel! [![Discord](https://img.shields.io/ba
 
 
 ## News 🔥🔥🔥
+- [07-10-2025] Data Formulator 0.2.2: Start with an analysis goal
+  - Some key frontend performance updates. 
+  - You can start your exploration with a goal, or press Tab to see whether the agent can recommend some good exploration ideas for you. [Demo](https://github.com/microsoft/data-formulator/pull/176)
 
-- [05-13-2025] Data Formulator 0.2.3 / 0.2.4: External Data Loader 
+- [05-13-2025] Data Formulator 0.2.1.3/4: External Data Loader 
   - We introduced external data loader class to make import data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155)
     - Current data loaders: MySQL, Azure Data Explorer (Kusto), Azure Blob and Amazon S3 (json, parquet, csv).
     - [07-01-2025] Updated with: Postgresql, mssql.

diff --git a/py-src/data_formulator/agents/__init__.py b/py-src/data_formulator/agents/__init__.py
@@ -20,5 +20,5 @@
     "SQLDataRecAgent",
     "DataLoadAgent",
     "SortDataAgent",
-    "DataCleanAgent"
+    "DataCleanAgent",
 ]
diff --git a/py-src/data_formulator/agents/agent_data_load.py b/py-src/data_formulator/agents/agent_data_load.py
@@ -12,7 +12,10 @@
 
 
 SYSTEM_PROMPT = '''You are a data scientist to help user infer data types based off the table provided by the user.
-Given a dataset provided by the user, identify their type and semantic type, and provide a very short summary of the dataset.
+Given a dataset provided by the user, 
+1. identify their type and semantic type
+2. provide a very short summary of the dataset.
+3. provide a list of (5-10) explorative questions that can help users get started with data visualizations.
 
 Types to consider include: string, number, date
 Semantic types to consider include: Location, Year, Month, Day, Date, Time, DateTime, Range, Duration, Name, Percentage, String, Number
@@ -34,7 +37,8 @@
         "field2": {"type": ..., "semantic_type": ..., "sort_order": null}, 
         ...
     },
-    "data summary": ... // a short summary of the data
+    "data summary": ... // a short summary of the data,
+    "explorative_questions": [...], // a list of explorative questions that can help users get started with data visualizations
 }
 ```
 '''
@@ -76,7 +80,13 @@
         "total": {"type": "number", "semantic_type": "Number", "sort_order": null},
         "group": {"type": "string", "semantic_type": "Range", "sort_order": ["<10000", "10000 to 14999", "15000 to 24999", "25000 to 34999", "35000 to 49999", "50000 to 74999", "75000 to 99999", "100000 to 149999", "150000 to 199999", "200000+"]}
     },
-    "data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups."
+    "data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups.",
+    "explorative_questions": [
+        "What is the average income across different states?",
+        "What is the distribution of income across different regions?",
+        "What is the relationship between income and state ID?",
+        "What is the relationship between income and region?"
+    ]
 }
 ```
 
@@ -121,7 +131,13 @@
             "sort_order": null  
         }  
     },  
-    "data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format."  
+    "data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format.",
+    "explorative_questions": [
+        "What is the average temperature across different cities?",
+        "What is the distribution of temperature across different dates?",
+        "What is the relationship between temperature and city?",
+        "What is the relationship between temperature and date?"
+    ]
 }```'''
 
 class DataLoadAgent(object):

diff --git a/py-src/data_formulator/agents/agent_query_completion.py b/py-src/data_formulator/agents/agent_query_completion.py
@@ -54,7 +54,7 @@ def __init__(self, client):
 
     def run(self, data_source_metadata, query):
 
-        user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n[REASONING]\n"
+        user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n"
 
         logger.info(user_query)
 
@@ -63,11 +63,11 @@ def run(self, data_source_metadata, query):
 
         ###### the part that calls open_ai
         response = self.client.get_completion(messages = messages)
-        response_content = '[REASONING]\n' + response.choices[0].message.content
+        response_content = response.choices[0].message.content
 
         logger.info(f"=== query completion output ===>\n{response_content}\n")
 
-        reasoning = extract_json_objects(response_content.split("[REASONING]")[1].split("[QUERY]")[0].strip())[0]
+        reasoning = extract_json_objects(response_content.split("[QUERY]")[0].strip())[0]
         output_query = response_content.split("[QUERY]")[1].strip()
 
         # Extract the query by removing the language markers

diff --git a/py-src/data_formulator/data_loader/azure_blob_data_loader.py b/py-src/data_formulator/data_loader/azure_blob_data_loader.py
@@ -369,4 +369,4 @@ def ingest_data_from_query(self, query: str, name_as: str):
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
diff --git a/py-src/data_formulator/data_loader/mssql_data_loader.py b/py-src/data_formulator/data_loader/mssql_data_loader.py
@@ -445,7 +445,7 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
         try:
             df = self._execute_query(query)
             # Use the base class's method to ingest the DataFrame
-            self.ingest_df_to_duckdb(df, name_as)
+            self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
             log.info(f"Successfully ingested {len(df)} rows from custom query to {name_as}")
             return df
         except Exception as e:

diff --git a/py-src/data_formulator/data_loader/mysql_data_loader.py b/py-src/data_formulator/data_loader/mysql_data_loader.py
@@ -63,7 +63,9 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti
         try:
             self.duck_db_conn.execute("DETACH mysqldb;")
         except:
-            pass  # Ignore if mysqldb doesn't exist        # Register MySQL connection
+            pass  # Ignore if mysqldb doesn't exist
+
+        # Register MySQL connection
         self.duck_db_conn.execute(f"ATTACH '{attach_string}' AS mysqldb (TYPE mysql);")
 
     def list_tables(self, table_filter: str = None):
@@ -129,4 +131,4 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
diff --git a/py-src/data_formulator/data_loader/postgresql_data_loader.py b/py-src/data_formulator/data_loader/postgresql_data_loader.py
@@ -128,5 +128,5 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
         return df
diff --git a/py-src/data_formulator/data_loader/s3_data_loader.py b/py-src/data_formulator/data_loader/s3_data_loader.py
@@ -203,4 +203,4 @@ def ingest_data_from_query(self, query: str, name_as: str):
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "data_formulator"
-version = "0.2.1.5"
+version = "0.2.2"
 
 requires-python = ">=3.9"
 authors = [

diff --git a/src/app/App.tsx b/src/app/App.tsx
@@ -519,7 +519,7 @@ export const AppFC: FC<AppFCProps> = function AppFC(appProps) {
                 //user is not logged in, do not show logout button
                 //console.error(err)
             });
-    }, [])
+    }, []);
 
     useEffect(() => {
         document.title = toolName;

diff --git a/src/app/dfSlice.tsx b/src/app/dfSlice.tsx
@@ -16,12 +16,11 @@ import { handleSSEMessage } from './SSEActions';
 
 enableMapSet();
 
-export const generateFreshChart = (tableRef: string, chartType?: string, source: "user" | "trigger" = "user") : Chart => {
-    let realChartType = chartType || "?"
+export const generateFreshChart = (tableRef: string, chartType: string, source: "user" | "trigger" = "user") : Chart => {
     return { 
         id: `chart-${Date.now()- Math.floor(Math.random() * 10000)}`, 
-        chartType: realChartType, 
-        encodingMap: Object.assign({}, ...getChartChannels(realChartType).map((channel) => ({ [channel]: { channel: channel, bin: false } }))),
+        chartType: chartType, 
+        encodingMap: Object.assign({}, ...getChartChannels(chartType).map((channel) => ({ [channel]: { channel: channel, bin: false } }))),
         tableRef: tableRef,
         saved: false,
         source: source,
@@ -45,12 +44,11 @@ export interface ModelConfig {
 }
 
 // Define model slot types
-export type ModelSlotType = 'generation' | 'hint';
+export const MODEL_SLOT_TYPES = ['generation', 'hint'] as const;
+export type ModelSlotType = typeof MODEL_SLOT_TYPES[number];
 
-export interface ModelSlots {
-    generation?: string; // model id assigned to generation tasks
-    hint?: string; // model id assigned to hint tasks
-}
+// Derive ModelSlots interface from the constant
+export type ModelSlots = Partial<Record<ModelSlotType, string>>;
 
 // Define a type for the slice state
 export interface DataFormulatorState {
@@ -271,7 +269,7 @@ export const dataFormulatorSlice = createSlice({
             // avoid resetting inputted models
             // state.oaiModels = state.oaiModels.filter((m: any) => m.endpoint != 'default');
 
-            state.modelSlots = {};
+            // state.modelSlots = {};
             state.testedModels = [];
 
             state.tables = [];
@@ -358,11 +356,13 @@ export const dataFormulatorSlice = createSlice({
         },
         loadTable: (state, action: PayloadAction<DictTable>) => {
             let table = action.payload;
+            let freshChart = generateFreshChart(table.id, '?') as Chart;
             state.tables = [...state.tables, table];
+            state.charts = [...state.charts, freshChart];
             state.conceptShelfItems = [...state.conceptShelfItems, ...getDataFieldItems(table)];
 
             state.focusedTableId = table.id;
-            state.focusedChartId = undefined;
+            state.focusedChartId = freshChart.id;
         },
         deleteTable: (state, action: PayloadAction<string>) => {
             let tableId = action.payload;
@@ -452,7 +452,7 @@ export const dataFormulatorSlice = createSlice({
                 });
             }
         },
-        createNewChart: (state, action: PayloadAction<{chartType?: string, tableId?: string}>) => {
+        createNewChart: (state, action: PayloadAction<{chartType: string, tableId: string}>) => {
             let chartType = action.payload.chartType;
             let tableId = action.payload.tableId || state.tables[0].id;
             let freshChart = generateFreshChart(tableId, chartType, "user") as Chart;
@@ -745,6 +745,11 @@ export const dataFormulatorSlice = createSlice({
                         return field;
                     }
                 })
+
+                if (data["result"][0]["explorative_questions"] && data["result"][0]["explorative_questions"].length > 0) {
+                    let table = state.tables.find(t => t.id == tableId) as DictTable;
+                    table.explorativeQuestions = data["result"][0]["explorative_questions"] as string[];
+                }
             }
         })
         .addCase(fetchAvailableModels.fulfilled, (state, action) => {
@@ -763,8 +768,12 @@ export const dataFormulatorSlice = createSlice({
                 ...state.testedModels.filter(t => !defaultModels.map((m: ModelConfig) => m.id).includes(t.id))
             ]
 
-            if (state.modelSlots.generation == undefined && defaultModels.length > 0) {
-                state.modelSlots.generation = defaultModels[0].id;
+            if (defaultModels.length > 0) {
+                for (const slotType of MODEL_SLOT_TYPES) {
+                    if (state.modelSlots[slotType] == undefined) {
+                        state.modelSlots[slotType] = defaultModels[0].id;
+                    }
+                }
             }
 
             // console.log("load model complete");
@@ -796,7 +805,7 @@ export const dfSelectors = {
         return modelId ? state.models.find(m => m.id === modelId) : undefined;
     },
     getAllSlotTypes: () : ModelSlotType[] => {
-        return ['generation', 'hint'];
+        return [...MODEL_SLOT_TYPES];
     },
     getActiveBaseTableIds: (state: DataFormulatorState) => {
         let focusedTableId = state.focusedTableId;

diff --git a/src/components/ComponentType.tsx b/src/components/ComponentType.tsx
@@ -76,14 +76,16 @@ export interface DictTable {
         rowCount: number; // total number of rows in the full table
     };
     anchored: boolean; // whether this table is anchored as a persistent table used to derive other tables
+    explorativeQuestions: string[]; // a list of explorative questions that can help users get started with data visualizations (the data load agent prompt asks for 5-10)
 }
 
 export function createDictTable(
     id: string, rows: any[], 
     derive: {code: string, codeExpl: string, source: string[], dialog: any[], 
              trigger: Trigger} | undefined = undefined,
     virtual: {tableId: string, rowCount: number} | undefined = undefined,
-    anchored: boolean = false) : DictTable {
+    anchored: boolean = false,
+    explorativeQuestions: string[] = []) : DictTable {
 
     let names = Object.keys(rows[0])
 
@@ -95,7 +97,8 @@ export function createDictTable(
         types: names.map(name => inferTypeFromValueArray(rows.map(r => r[name]))),
         derive,
         virtual,
-        anchored
+        anchored,
+        explorativeQuestions
     }
 }