Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,11 @@ Any questions? Ask on the Discord channel! [![Discord](https://img.shields.io/ba


## News 🔥🔥🔥
- [07-10-2025] Data Formulator 0.2.2: Start with an analysis goal
- Some key frontend performance updates.
- You can start your exploration with a goal, or tab to see if the agent can recommend some good exploration ideas for you. [Demo](https://github.com/microsoft/data-formulator/pull/176)

- [05-13-2025] Data Formulator 0.2.3 / 0.2.4: External Data Loader
- [05-13-2025] Data Formulator 0.2.1.3/4: External Data Loader
- We introduced an external data loader class to make importing data easier. [Readme](https://github.com/microsoft/data-formulator/tree/main/py-src/data_formulator/data_loader) and [Demo](https://github.com/microsoft/data-formulator/pull/155)
- Current data loaders: MySQL, Azure Data Explorer (Kusto), Azure Blob and Amazon S3 (json, parquet, csv).
- [07-01-2025] Updated with: Postgresql, mssql.
Expand Down
2 changes: 1 addition & 1 deletion py-src/data_formulator/agents/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,5 @@
"SQLDataRecAgent",
"DataLoadAgent",
"SortDataAgent",
"DataCleanAgent"
"DataCleanAgent",
]
24 changes: 20 additions & 4 deletions py-src/data_formulator/agents/agent_data_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@


SYSTEM_PROMPT = '''You are a data scientist to help user infer data types based off the table provided by the user.
Given a dataset provided by the user, identify their type and semantic type, and provide a very short summary of the dataset.
Given a dataset provided by the user,
1. identify their type and semantic type
2. provide a very short summary of the dataset.
3. provide a list of (5-10) explorative questions that can help users get started with data visualizations.

Types to consider include: string, number, date
Semantic types to consider include: Location, Year, Month, Day, Date, Time, DateTime, Range, Duration, Name, Percentage, String, Number
Expand All @@ -34,7 +37,8 @@
"field2": {"type": ..., "semantic_type": ..., "sort_order": null},
...
},
"data summary": ... // a short summary of the data
"data summary": ... // a short summary of the data,
"explorative_questions": [...], // a list of explorative questions that can help users get started with data visualizations
}
```
'''
Expand Down Expand Up @@ -76,7 +80,13 @@
"total": {"type": "number", "semantic_type": "Number", "sort_order": null},
"group": {"type": "string", "semantic_type": "Range", "sort_order": ["<10000", "10000 to 14999", "15000 to 24999", "25000 to 34999", "35000 to 49999", "50000 to 74999", "75000 to 99999", "100000 to 149999", "150000 to 199999", "200000+"]}
},
"data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups."
"data summary": "The dataset contains information about income distribution across different states in the USA. It includes fields for state names, regions, state IDs, percentage of total income, total income, and income groups.",
"explorative_questions": [
"What is the average income across different states?",
"What is the distribution of income across different regions?",
"What is the relationship between income and state ID?",
"What is the relationship between income and region?"
]
}
```

Expand Down Expand Up @@ -121,7 +131,13 @@
"sort_order": null
}
},
"data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format."
"data_summary": "This dataset contains weather information for the cities of Seattle and Atlanta. The fields include the date, city name, and temperature readings. The 'Date' field represents dates in a string format, the 'City' field represents city names, and the 'Temperature' field represents temperature values in integer format.",
"explorative_questions": [
"What is the average temperature across different cities?",
"What is the distribution of temperature across different dates?",
"What is the relationship between temperature and city?",
"What is the relationship between temperature and date?"
]
}```'''

class DataLoadAgent(object):
Expand Down
6 changes: 3 additions & 3 deletions py-src/data_formulator/agents/agent_query_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __init__(self, client):

def run(self, data_source_metadata, query):

user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n[REASONING]\n"
user_query = f"[DATA SOURCE]\n\n{json.dumps(data_source_metadata, indent=2)}\n\n[USER INPUTS]\n\n{query}\n\n"

logger.info(user_query)

Expand All @@ -63,11 +63,11 @@ def run(self, data_source_metadata, query):

###### the part that calls open_ai
response = self.client.get_completion(messages = messages)
response_content = '[REASONING]\n' + response.choices[0].message.content
response_content = response.choices[0].message.content

logger.info(f"=== query completion output ===>\n{response_content}\n")

reasoning = extract_json_objects(response_content.split("[REASONING]")[1].split("[QUERY]")[0].strip())[0]
reasoning = extract_json_objects(response_content.split("[QUERY]")[0].strip())[0]
output_query = response_content.split("[QUERY]")[1].strip()

# Extract the query by removing the language markers
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -369,4 +369,4 @@ def ingest_data_from_query(self, query: str, name_as: str):
# Execute the query and get results as a DataFrame
df = self.duck_db_conn.execute(query).df()
# Use the base class's method to ingest the DataFrame
self.ingest_df_to_duckdb(df, name_as)
self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
2 changes: 1 addition & 1 deletion py-src/data_formulator/data_loader/mssql_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
try:
df = self._execute_query(query)
# Use the base class's method to ingest the DataFrame
self.ingest_df_to_duckdb(df, name_as)
self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
log.info(f"Successfully ingested {len(df)} rows from custom query to {name_as}")
return df
except Exception as e:
Expand Down
6 changes: 4 additions & 2 deletions py-src/data_formulator/data_loader/mysql_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ def __init__(self, params: Dict[str, Any], duck_db_conn: duckdb.DuckDBPyConnecti
try:
self.duck_db_conn.execute("DETACH mysqldb;")
except:
pass # Ignore if mysqldb doesn't exist # Register MySQL connection
pass # Ignore if mysqldb doesn't exist

# Register MySQL connection
self.duck_db_conn.execute(f"ATTACH '{attach_string}' AS mysqldb (TYPE mysql);")

def list_tables(self, table_filter: str = None):
Expand Down Expand Up @@ -129,4 +131,4 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
# Execute the query and get results as a DataFrame
df = self.duck_db_conn.execute(query).df()
# Use the base class's method to ingest the DataFrame
self.ingest_df_to_duckdb(df, name_as)
self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
Original file line number Diff line number Diff line change
Expand Up @@ -128,5 +128,5 @@ def ingest_data_from_query(self, query: str, name_as: str) -> pd.DataFrame:
# Execute the query and get results as a DataFrame
df = self.duck_db_conn.execute(query).df()
# Use the base class's method to ingest the DataFrame
self.ingest_df_to_duckdb(df, name_as)
self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
return df
2 changes: 1 addition & 1 deletion py-src/data_formulator/data_loader/s3_data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,4 +203,4 @@ def ingest_data_from_query(self, query: str, name_as: str):
# Execute the query and get results as a DataFrame
df = self.duck_db_conn.execute(query).df()
# Use the base class's method to ingest the DataFrame
self.ingest_df_to_duckdb(df, name_as)
self.ingest_df_to_duckdb(df, sanitize_table_name(name_as))
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "data_formulator"
version = "0.2.1.5"
version = "0.2.2"

requires-python = ">=3.9"
authors = [
Expand Down
2 changes: 1 addition & 1 deletion src/app/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ export const AppFC: FC<AppFCProps> = function AppFC(appProps) {
//user is not logged in, do not show logout button
//console.error(err)
});
}, [])
}, []);

useEffect(() => {
document.title = toolName;
Expand Down
39 changes: 24 additions & 15 deletions src/app/dfSlice.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,11 @@ import { handleSSEMessage } from './SSEActions';

enableMapSet();

export const generateFreshChart = (tableRef: string, chartType?: string, source: "user" | "trigger" = "user") : Chart => {
let realChartType = chartType || "?"
export const generateFreshChart = (tableRef: string, chartType: string, source: "user" | "trigger" = "user") : Chart => {
return {
id: `chart-${Date.now()- Math.floor(Math.random() * 10000)}`,
chartType: realChartType,
encodingMap: Object.assign({}, ...getChartChannels(realChartType).map((channel) => ({ [channel]: { channel: channel, bin: false } }))),
chartType: chartType,
encodingMap: Object.assign({}, ...getChartChannels(chartType).map((channel) => ({ [channel]: { channel: channel, bin: false } }))),
tableRef: tableRef,
saved: false,
source: source,
Expand All @@ -45,12 +44,11 @@ export interface ModelConfig {
}

// Define model slot types
export type ModelSlotType = 'generation' | 'hint';
export const MODEL_SLOT_TYPES = ['generation', 'hint'] as const;
export type ModelSlotType = typeof MODEL_SLOT_TYPES[number];

export interface ModelSlots {
generation?: string; // model id assigned to generation tasks
hint?: string; // model id assigned to hint tasks
}
// Derive ModelSlots interface from the constant
export type ModelSlots = Partial<Record<ModelSlotType, string>>;

// Define a type for the slice state
export interface DataFormulatorState {
Expand Down Expand Up @@ -271,7 +269,7 @@ export const dataFormulatorSlice = createSlice({
// avoid resetting inputted models
// state.oaiModels = state.oaiModels.filter((m: any) => m.endpoint != 'default');

state.modelSlots = {};
// state.modelSlots = {};
state.testedModels = [];

state.tables = [];
Expand Down Expand Up @@ -358,11 +356,13 @@ export const dataFormulatorSlice = createSlice({
},
loadTable: (state, action: PayloadAction<DictTable>) => {
let table = action.payload;
let freshChart = generateFreshChart(table.id, '?') as Chart;
state.tables = [...state.tables, table];
state.charts = [...state.charts, freshChart];
state.conceptShelfItems = [...state.conceptShelfItems, ...getDataFieldItems(table)];

state.focusedTableId = table.id;
state.focusedChartId = undefined;
state.focusedChartId = freshChart.id;
},
deleteTable: (state, action: PayloadAction<string>) => {
let tableId = action.payload;
Expand Down Expand Up @@ -452,7 +452,7 @@ export const dataFormulatorSlice = createSlice({
});
}
},
createNewChart: (state, action: PayloadAction<{chartType?: string, tableId?: string}>) => {
createNewChart: (state, action: PayloadAction<{chartType: string, tableId: string}>) => {
let chartType = action.payload.chartType;
let tableId = action.payload.tableId || state.tables[0].id;
let freshChart = generateFreshChart(tableId, chartType, "user") as Chart;
Expand Down Expand Up @@ -745,6 +745,11 @@ export const dataFormulatorSlice = createSlice({
return field;
}
})

if (data["result"][0]["explorative_questions"] && data["result"][0]["explorative_questions"].length > 0) {
let table = state.tables.find(t => t.id == tableId) as DictTable;
table.explorativeQuestions = data["result"][0]["explorative_questions"] as string[];
}
}
})
.addCase(fetchAvailableModels.fulfilled, (state, action) => {
Expand All @@ -763,8 +768,12 @@ export const dataFormulatorSlice = createSlice({
...state.testedModels.filter(t => !defaultModels.map((m: ModelConfig) => m.id).includes(t.id))
]

if (state.modelSlots.generation == undefined && defaultModels.length > 0) {
state.modelSlots.generation = defaultModels[0].id;
if (defaultModels.length > 0) {
for (const slotType of MODEL_SLOT_TYPES) {
if (state.modelSlots[slotType] == undefined) {
state.modelSlots[slotType] = defaultModels[0].id;
}
}
}

// console.log("load model complete");
Expand Down Expand Up @@ -796,7 +805,7 @@ export const dfSelectors = {
return modelId ? state.models.find(m => m.id === modelId) : undefined;
},
getAllSlotTypes: () : ModelSlotType[] => {
return ['generation', 'hint'];
return [...MODEL_SLOT_TYPES];
},
getActiveBaseTableIds: (state: DataFormulatorState) => {
let focusedTableId = state.focusedTableId;
Expand Down
7 changes: 5 additions & 2 deletions src/components/ComponentType.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,16 @@ export interface DictTable {
rowCount: number; // total number of rows in the full table
};
anchored: boolean; // whether this table is anchored as a persistent table used to derive other tables
explorativeQuestions: string[]; // a list of (3-5) explorative questions that can help users get started with data visualizations
}

export function createDictTable(
id: string, rows: any[],
derive: {code: string, codeExpl: string, source: string[], dialog: any[],
trigger: Trigger} | undefined = undefined,
virtual: {tableId: string, rowCount: number} | undefined = undefined,
anchored: boolean = false) : DictTable {
anchored: boolean = false,
explorativeQuestions: string[] = []) : DictTable {

let names = Object.keys(rows[0])

Expand All @@ -95,7 +97,8 @@ export function createDictTable(
types: names.map(name => inferTypeFromValueArray(rows.map(r => r[name]))),
derive,
virtual,
anchored
anchored,
explorativeQuestions
}
}

Expand Down
Loading