
Commit bb47180

[deploy] Merge pull request #176 from microsoft/dev
0.2.2 -- Starting with an analysis goal, or get suggestions from the agent.
2 parents: 42be5f4 + a288407

21 files changed: +774 −169 lines

README.md

Lines changed: 4 additions & 1 deletion
```diff
@@ -25,8 +25,11 @@ Any questions? Ask on the Discord channel! [![Discord](https://img.shields.io/ba
 
 
 ## News 🔥🔥🔥
+- [07-10-2025] Data Formulator 0.2.2: Start with an analysis goal
+  - Some key frontend performance updates.
+  - You can start your exploration with a goal, or, tab and see if the agent can recommend some good exploration ideas for you. [Demo](https://github.com/microsoft/data-formulator/pull/176)
 
-- [05-13-2025] Data Formulator 0.2.3 / 0.2.4: External Data Loader
+- [05-13-2025] Data Formulator 0.2.1.3/4: External Data Loader
   - We introduced external data loader class to make import data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155)
   - Current data loaders: MySQL, Azure Data Explorer (Kusto), Azure Blob and Amazon S3 (json, parquet, csv).
   - [07-01-2025] Updated with: Postgresql, mssql.
```

py-src/data_formulator/agents/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -20,5 +20,5 @@
     "SQLDataRecAgent",
     "DataLoadAgent",
     "SortDataAgent",
-    "DataCleanAgent"
+    "DataCleanAgent",
 ]
```

py-src/data_formulator/agents/agent_data_load.py

Lines changed: 20 additions & 4 deletions
````diff
@@ -12,7 +12,10 @@
 
 
 SYSTEM_PROMPT = '''You are a data scientist to help user infer data types based off the table provided by the user.
-Given a dataset provided by the user, identify their type and semantic type, and provide a very short summary of the dataset.
+Given a dataset provided by the user,
+1. identify their type and semantic type
+2. provide a very short summary of the dataset.
+3. provide a list of (5-10) explorative questions that can help users get started with data visualizations.
 
 Types to consider include: string, number, date
 Semantic types to consider include: Location, Year, Month, Day, Date, Time, DateTime, Range, Duration, Name, Percentage, String, Number
@@ -34,7 +37,8 @@
         "field2": {"type": ..., "semantic_type": ..., "sort_order": null},
         ...
     },
-    "data summary": ... // a short summary of the data
+    "data summary": ... // a short summary of the data,
+    "explorative_questions": [...], // a list of explorative questions that can help users get started with data visualizations
 }
 ```
 '''
@@ -76,7 +80,13 @@
         "total": {"type": "number", "semantic_type": "Number", "sort_order": null},
         "group": {"type": "string", "semantic_type": "Range", "sort_order": ["<10000", "10000 to 14999", "15000 to 24999", "25000 to 34999", "35000 to 49999", "50000 to 74999", "75000 to 99999", "100000 to 149999", "150000 to 199999", "200000+"]}
     },
-    "data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups."
+    "data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups.",
+    "explorative_questions": [
+        "What is the average income across different states?",
+        "What is the distribution of income across different regions?",
+        "What is the relationship between income and state ID?",
+        "What is the relationship between income and region?"
+    ]
 }
 ```
 
@@ -121,7 +131,13 @@
             "sort_order": null
         }
     },
-    "data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format."
+    "data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format.",
+    "explorative_questions": [
+        "What is the average temperature across different cities?",
+        "What is the distribution of temperature across different dates?",
+        "What is the relationship between temperature and city?",
+        "What is the relationship between temperature and date?"
+    ]
 }```'''
 
 class DataLoadAgent(object):
````
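The updated prompt asks the model to return an `explorative_questions` array alongside the field types and the summary. As a rough sketch of how a caller could pull that array out of the model's fenced JSON reply (the helper function and sample reply below are hypothetical, not code from this repository):

```python
import json
import re

def parse_data_load_reply(reply: str) -> dict:
    """Hypothetical helper: extract the JSON object from a ```json ...``` fenced block."""
    match = re.search(r"```(?:json)?\s*(\{.*\})\s*```", reply, re.DOTALL)
    if not match:
        raise ValueError("no JSON block found in model reply")
    return json.loads(match.group(1))

# A made-up reply in the shape the updated SYSTEM_PROMPT requests.
reply = '''```json
{
    "fields": {
        "Temperature": {"type": "number", "semantic_type": "Number", "sort_order": null}
    },
    "data summary": "Daily temperatures for two cities.",
    "explorative_questions": [
        "What is the average temperature across different cities?",
        "What is the distribution of temperature across different dates?"
    ]
}
```'''

result = parse_data_load_reply(reply)
for question in result.get("explorative_questions", []):
    print("-", question)
```

Each extracted question can then be surfaced to the user as a suggested starting point for exploration, which is the feature this release announces.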

py-src/data_formulator/agents/agent_query_completion.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -54,7 +54,7 @@ def __init__(self, client):
 
     def run(self, data_source_metadata, query):
 
-        user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n[REASONING]\n"
+        user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n"
 
         logger.info(user_query)
 
@@ -63,11 +63,11 @@ def run(self, data_source_metadata, query):
 
         ###### the part that calls open_ai
         response = self.client.get_completion(messages = messages)
-        response_content = '[REASONING]\n' + response.choices[0].message.content
+        response_content = response.choices[0].message.content
 
         logger.info(f"=== query completion output ===>\n{response_content}\n")
 
-        reasoning = extract_json_objects(response_content.split("[REASONING]")[1].split("[QUERY]")[0].strip())[0]
+        reasoning = extract_json_objects(response_content.split("[QUERY]")[0].strip())[0]
         output_query = response_content.split("[QUERY]")[1].strip()
 
         # Extract the query by removing the language markers
```
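With the `[REASONING]` marker dropped, everything before `[QUERY]` is treated as the reasoning JSON and everything after it as the query. A rough sketch of that split, using a simplified stand-in for the repository's `extract_json_objects` helper and a made-up model response:

```python
import json

def extract_json_objects(text: str) -> list:
    # Simplified stand-in for the repo helper: assumes the text is one JSON object.
    return [json.loads(text)]

# Made-up response in the post-change format: reasoning JSON, then [QUERY], then fenced SQL.
response_content = '''{"goal": "count rows per region"}
[QUERY]
```sql
SELECT region, COUNT(*) FROM income GROUP BY region
```'''

reasoning = extract_json_objects(response_content.split("[QUERY]")[0].strip())[0]
output_query = response_content.split("[QUERY]")[1].strip()

# Extract the query by removing the language markers, as the agent does next.
if output_query.startswith("```"):
    output_query = "\n".join(output_query.split("\n")[1:]).rsplit("```", 1)[0].strip()

print(reasoning)     # {'goal': 'count rows per region'}
print(output_query)  # SELECT region, COUNT(*) FROM income GROUP BY region
```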

py-src/data_formulator/data_loader/azure_blob_data_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -369,4 +369,4 @@ def ingest_data_from_query(self, query: str, name_as: str):
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
```
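Every loader's `ingest_data_from_query` now routes the caller-supplied table name through `sanitize_table_name` before handing it to DuckDB; the same one-line change repeats in the mssql, mysql, postgresql, and s3 loaders below. The implementation of `sanitize_table_name` is not part of this diff, so here is only a plausible sketch of what such a helper typically does:

```python
import re

def sanitize_table_name(name: str) -> str:
    """Hypothetical sketch: reduce an arbitrary name to a safe SQL identifier."""
    # Replace anything that is not a letter, digit, or underscore.
    safe = re.sub(r"\W", "_", name)
    # Identifiers should not start with a digit.
    if safe and safe[0].isdigit():
        safe = "_" + safe
    return safe or "_unnamed"

assert sanitize_table_name("2024 sales-report.csv") == "_2024_sales_report_csv"
```

Sanitizing at this point keeps a user-supplied name from breaking, or injecting into, whatever CREATE TABLE statement the base class's `ingest_df_to_duckdb` presumably issues.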

py-src/data_formulator/data_loader/mssql_data_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -445,7 +445,7 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
         try:
             df = self._execute_query(query)
             # Use the base class's method to ingest the DataFrame
-            self.ingest_df_to_duckdb(df, name_as)
+            self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
             log.info(f"Successfully ingested {len(df)} rows from custom query to {name_as}")
             return df
         except Exception as e:
```

py-src/data_formulator/data_loader/mysql_data_loader.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -63,7 +63,9 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti
         try:
             self.duck_db_conn.execute("DETACH mysqldb;")
         except:
-            pass # Ignore if mysqldb doesn't exist # Register MySQL connection
+            pass # Ignore if mysqldb doesn't exist
+
+        # Register MySQL connection
         self.duck_db_conn.execute(f"ATTACH '{attach_string}' AS mysqldb (TYPE mysql);")
 
     def list_tables(self, table_filter: str = None):
@@ -129,4 +131,4 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
```
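The constructor change only untangles a comment that had been glued onto the `pass` line; the detach-then-attach pattern itself is unchanged. For reference, a minimal standalone sketch of that pattern with DuckDB's mysql extension (the connection parameters here are made up):

```python
import duckdb

con = duckdb.connect()
con.execute("INSTALL mysql; LOAD mysql;")

# Hypothetical connection string; substitute your own host/user/port/database.
attach_string = "host=localhost user=root port=3306 database=demo"

try:
    con.execute("DETACH mysqldb;")  # drop a stale attachment from a previous run
except duckdb.Error:
    pass  # Ignore if mysqldb doesn't exist

# Register MySQL connection as an attached catalog.
con.execute(f"ATTACH '{attach_string}' AS mysqldb (TYPE mysql);")
print(con.execute("SHOW DATABASES;").df())
```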

py-src/data_formulator/data_loader/postgresql_data_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -128,5 +128,5 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
         return df
```

py-src/data_formulator/data_loader/s3_data_loader.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -203,4 +203,4 @@ def ingest_data_from_query(self, query: str, name_as: str):
         # Execute the query and get results as a DataFrame
         df = self.duck_db_conn.execute(query).df()
         # Use the base class's method to ingest the DataFrame
-        self.ingest_df_to_duckdb(df, name_as)
+        self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
```

pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "data_formulator"
-version = "0.2.1.5"
+version = "0.2.2"
 
 requires-python = ">=3.9"
 authors = [
```
