
Commit bb47180

[deploy] Merge pull request #176 from microsoft/dev
0.2.2 -- Starting with an analysis goal, or get suggestions from the agent.
2 parents: 42be5f4 + a288407

21 files changed: +774 −169 lines

README.md

Lines changed: 4 additions & 1 deletion
```diff
@@ -25,8 +25,11 @@ Any questions? Ask on the Discord channel! [![Discord](https://img.shields.io/ba
 
 
 ## News 🔥🔥🔥
+- [07-10-2025] Data Formulator 0.2.2: Start with an analysis goal
+  - Some key frontend performance updates.
+  - You can start your exploration with a goal, or, tab and see if the agent can recommend some good exploration ideas for you. [Demo](https://github.com/microsoft/data-formulator/pull/176)
 
-- [05-13-2025] Data Formulator 0.2.3 / 0.2.4: External Data Loader
+- [05-13-2025] Data Formulator 0.2.1.3/4: External Data Loader
   - We introduced external data loader class to make import data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155)
   - Current data loaders: MySQL, Azure Data Explorer (Kusto), Azure Blob and Amazon S3 (json, parquet, csv).
   - [07-01-2025] Updated with: Postgresql, mssql.
```

py-src/data_formulator/agents/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -20,5 +20,5 @@
     "SQLDataRecAgent",
     "DataLoadAgent",
     "SortDataAgent",
-    "DataCleanAgent"
+    "DataCleanAgent",
 ]
```

py-src/data_formulator/agents/agent_data_load.py

Lines changed: 20 additions & 4 deletions
````diff
@@ -12,7 +12,10 @@
 
 
 SYSTEM_PROMPT = '''You are a data scientist to help user infer data types based off the table provided by the user.
-Given a dataset provided by the user, identify their type and semantic type, and provide a very short summary of the dataset.
+Given a dataset provided by the user,
+1. identify their type and semantic type
+2. provide a very short summary of the dataset.
+3. provide a list of (5-10) explorative questions that can help users get started with data visualizations.
 
 Types to consider include: string, number, date
 Semantic types to consider include: Location, Year, Month, Day, Date, Time, DateTime, Range, Duration, Name, Percentage, String, Number
@@ -34,7 +37,8 @@
         "field2": {"type": ..., "semantic_type": ..., "sort_order": null},
         ...
     },
-    "data summary": ... // a short summary of the data
+    "data summary": ... // a short summary of the data,
+    "explorative_questions": [...], // a list of explorative questions that can help users get started with data visualizations
 }
 ```
 '''
@@ -76,7 +80,13 @@
         "total": {"type": "number", "semantic_type": "Number", "sort_order": null},
         "group": {"type": "string", "semantic_type": "Range", "sort_order": ["<10000", "10000 to 14999", "15000 to 24999", "25000 to 34999", "35000 to 49999", "50000 to 74999", "75000 to 99999", "100000 to 149999", "150000 to 199999", "200000+"]}
     },
-    "data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups."
+    "data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups.",
+    "explorative_questions": [
+        "What is the average income across different states?",
+        "What is the distribution of income across different regions?",
+        "What is the relationship between income and state ID?",
+        "What is the relationship between income and region?"
+    ]
 }
 ```
 
@@ -121,7 +131,13 @@
             "sort_order": null
         }
     },
-    "data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format."
+    "data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format.",
+    "explorative_questions": [
+        "What is the average temperature across different cities?",
+        "What is the distribution of temperature across different dates?",
+        "What is the relationship between temperature and city?",
+        "What is the relationship between temperature and date?"
+    ]
 }```'''
 
 class DataLoadAgent(object):
````
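The updated prompt asks the model to return an `explorative_questions` array alongside the field types and the summary. As a rough sketch of how a caller could pull that array out of the model's fenced JSON reply (the helper function and sample reply below are hypothetical, not code from this repository):

```python
import json
import re

def parse_data_load_reply(reply: str) -> dict:
    """Hypothetical helper: extract the JSON object from a ```json ...``` fenced block."""
    match = re.search(r"```(?:json)?\s*(\{.*\})\s*```", reply, re.DOTALL)
    if not match:
        raise ValueError("no JSON block found in model reply")
    return json.loads(match.group(1))

# A made-up reply in the shape the updated SYSTEM_PROMPT requests.
reply = '''```json
{
    "fields": {
        "Temperature": {"type": "number", "semantic_type": "Number", "sort_order": null}
    },
    "data summary": "Daily temperatures for two cities.",
    "explorative_questions": [
        "What is the average temperature across different cities?",
        "What is the distribution of temperature across different dates?"
    ]
}
```'''

result = parse_data_load_reply(reply)
for question in result.get("explorative_questions", []):
    print("-", question)
```

Each extracted question can then be surfaced to the user as a suggested starting point for exploration, which is the feature this release announces.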

py-src/data_formulator/agents/agent_query_completion.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -54,7 +54,7 @@ def __init__(self, client):
 
     def run(self, data_source_metadata, query):
 
-        user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n[REASONING]\n"
+        user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n"
 
         logger.info(user_query)
 
@@ -63,11 +63,11 @@ def run(self, data_source_metadata, query):
 
         ###### the part that calls open_ai
         response = self.client.get_completion(messages = messages)
-        response_content = '[REASONING]\n' + response.choices[0].message.content
+        response_content = response.choices[0].message.content
 
         logger.info(f"=== query completion output ===>\n{response_content}\n")
 
-        reasoning = extract_json_objects(response_content.split("[REASONING]")[1].split("[QUERY]")[0].strip())[0]
+        reasoning = extract_json_objects(response_content.split("[QUERY]")[0].strip())[0]
         output_query = response_content.split("[QUERY]")[1].strip()
 
         # Extract the query by removing the language markers
```
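With the `[REASONING]` marker dropped, everything before `[QUERY]` is treated as the reasoning JSON and everything after it as the query. A rough sketch of that split, using a simplified stand-in for the repository's `extract_json_objects` helper and a made-up model response:

```python
import json

def extract_json_objects(text: str) -> list:
    # Simplified stand-in for the repo helper: assumes the text is one JSON object.
    return [json.loads(text)]

# Made-up response in the post-change format: reasoning JSON, then [QUERY], then fenced SQL.
response_content = '''{"goal": "count rows per region"}
[QUERY]
```sql
SELECT region, COUNT(*) FROM income GROUP BY region
```'''

reasoning = extract_json_objects(response_content.split("[QUERY]")[0].strip())[0]
output_query = response_content.split("[QUERY]")[1].strip()

# Extract the query by removing the language markers, as the agent does next.
if output_query.startswith("```"):
    output_query = "\n".join(output_query.split("\n")[1:]).rsplit("```", 1)[0].strip()

print(reasoning)     # {'goal': 'count rows per region'}
print(output_query)  # SELECT region, COUNT(*) FROM income GROUP BY region
```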

py-src/data_formulator/data_loader/azure_blob_data_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -369,4 +369,4 @@ def ingest_data_from_query(self, query: str, name_as: str):
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
```
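Every loader's `ingest_data_from_query` now routes the caller-supplied table name through `sanitize_table_name` before handing it to DuckDB; the same one-line change repeats in the mssql, mysql, postgresql, and s3 loaders below. The implementation of `sanitize_table_name` is not part of this diff, so here is only a plausible sketch of what such a helper typically does:

```python
import re

def sanitize_table_name(name: str) -> str:
    """Hypothetical sketch: reduce an arbitrary name to a safe SQL identifier."""
    # Replace anything that is not a letter, digit, or underscore.
    safe = re.sub(r"\W", "_", name)
    # Identifiers should not start with a digit.
    if safe and safe[0].isdigit():
        safe = "_" + safe
    return safe or "_unnamed"

assert sanitize_table_name("2024 sales-report.csv") == "_2024_sales_report_csv"
```

Sanitizing at this point keeps a user-supplied name from breaking, or injecting into, whatever CREATE TABLE statement the base class's `ingest_df_to_duckdb` presumably issues.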

py-src/data_formulator/data_loader/mssql_data_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -445,7 +445,7 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
         try:
             df = self._execute_query(query)
             # Use the base class's method to ingest the DataFrame
-            self.ingest_df_to_duckdb(df, name_as)
+            self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
             log.info(f"Successfully ingested {len(df)} rows from custom query to {name_as}")
             return df
         except Exception as e:
```

py-src/data_formulator/data_loader/mysql_data_loader.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -63,7 +63,9 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti
         try:
             self.duck_db_conn.execute("DETACH mysqldb;")
         except:
-            pass # Ignore if mysqldb doesn't exist # Register MySQL connection
+            pass # Ignore if mysqldb doesn't exist
+
+        # Register MySQL connection
         self.duck_db_conn.execute(f"ATTACH '{attach_string}' AS mysqldb (TYPE mysql);")
 
     def list_tables(self, table_filter: str = None):
@@ -129,4 +131,4 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
```
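The constructor change only untangles a comment that had been glued onto the `pass` line; the detach-then-attach pattern itself is unchanged. For reference, a minimal standalone sketch of that pattern with DuckDB's mysql extension (the connection parameters here are made up):

```python
import duckdb

con = duckdb.connect()
con.execute("INSTALL mysql; LOAD mysql;")

# Hypothetical connection string; substitute your own host/user/port/database.
attach_string = "host=localhost user=root port=3306 database=demo"

try:
    con.execute("DETACH mysqldb;")  # drop a stale attachment from a previous run
except duckdb.Error:
    pass  # Ignore if mysqldb doesn't exist

# Register MySQL connection as an attached catalog.
con.execute(f"ATTACH '{attach_string}' AS mysqldb (TYPE mysql);")
print(con.execute("SHOW DATABASES;").df())
```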

py-src/data_formulator/data_loader/postgresql_data_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -128,5 +128,5 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
         return df
```

py-src/data_formulator/data_loader/s3_data_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -203,4 +203,4 @@ def ingest_data_from_query(self, query: str, name_as: str):
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
```

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "data_formulator"
-version = "0.2.1.5"
+version = "0.2.2"
 
 requires-python = ">=3.9"
 authors = [
```
