Skip to content

Commit 93cb4da

Browse files
authored
[deploy] 0.1.6.2 Merge pull request #121 from microsoft/dev
1. Gracefully work with large data: when a chart has too many categorical values on the x/y/column/row axes, DF will now automatically cap the number of items displayed. 2. Load data from Excel. 3. Enhanced model configuration.
2 parents b68e136 + 49e6787 commit 93cb4da

14 files changed

+293
-196
lines changed

package.json

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,8 @@
4545
"vega": "^5.26.0",
4646
"vega-embed": "^6.21.0",
4747
"vega-lite": "^5.5.0",
48-
"vm-browserify": "^1.1.2"
48+
"vm-browserify": "^1.1.2",
49+
"xlsx": "^0.18.5"
4950
},
5051
"scripts": {
5152
"lint": "eslint -c eslint.config.js src/**/*.{ts,tsx} --fix",

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "data_formulator"
7-
version = "0.1.6.1"
7+
version = "0.1.6.2"
88

99
requires-python = ">=3.9"
1010
authors = [

src/app/dfSlice.tsx

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -675,8 +675,6 @@ export const dataFormulatorSlice = createSlice({
675675

676676
console.log("load model complete");
677677
console.log("state.models", state.models);
678-
console.log("state.selectedModelId", state.selectedModelId);
679-
console.log("state.testedModels", state.testedModels);
680678
})
681679
.addCase(fetchCodeExpl.fulfilled, (state, action) => {
682680
let codeExpl = action.payload;

src/app/utils.tsx

Lines changed: 87 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ export function runCodeOnInputListsInVM(
9898
let target = undefined;
9999
try {
100100
// copy args to ensure correctness of mapping
101-
target = func(...JSON.parse(JSON.stringify(args)))
101+
target = func(...structuredClone(args))
102102
} catch (err) {
103103
console.warn(`execution err ${err}`)
104104
}
@@ -179,7 +179,7 @@ export function baseTableToExtTable(table: any[], derivedFields: FieldItem[], al
179179
let args = inputTuples;
180180
if (func.length == baseCols.length * 2 + 1) {
181181
// avoid side effect, use the copy of the column when calling the function
182-
args = [...inputTuples, rowIdx, ...JSON.parse(JSON.stringify(baseCols))]
182+
args = [...inputTuples, rowIdx, ...structuredClone(baseCols)]
183183
}
184184

185185
target = func(...args);
@@ -242,7 +242,13 @@ export function baseTableToExtTable(table: any[], derivedFields: FieldItem[], al
242242
}
243243

244244

245-
export const instantiateVegaTemplate = (chartType: string, encodingMap: { [key in Channel]: EncodingItem; }, allFields: FieldItem[], workingTable: any[]) => {
245+
export const assembleVegaChart = (
246+
chartType: string,
247+
encodingMap: { [key in Channel]: EncodingItem; },
248+
conceptShelfItems: FieldItem[],
249+
workingTable: any[],
250+
maxNominalValues: number = 68
251+
) => {
246252

247253
if (chartType == "Table") {
248254
return ["Table", undefined];
@@ -251,8 +257,7 @@ export const instantiateVegaTemplate = (chartType: string, encodingMap: { [key i
251257
let chartTemplate = getChartTemplate(chartType) as ChartTemplate;
252258
//console.log(chartTemplate);
253259

254-
let vgObj = JSON.parse(JSON.stringify(chartTemplate.template));
255-
const baseTableSchemaObj: any = {};
260+
let vgObj = structuredClone(chartTemplate.template);
256261

257262
for (const [channel, encoding] of Object.entries(encodingMap)) {
258263

@@ -262,30 +267,8 @@ export const instantiateVegaTemplate = (chartType: string, encodingMap: { [key i
262267
encodingObj["scale"] = {"type": "sqrt", "zero": true};
263268
}
264269

265-
const field = encoding.fieldID ? _.find(allFields, (f) => f.id === encoding.fieldID) : undefined;
270+
const field = encoding.fieldID ? _.find(conceptShelfItems, (f) => f.id === encoding.fieldID) : undefined;
266271
if (field) {
267-
//console.log(field)
268-
// the synthesizer only need to see base table schema
269-
let baseFields = (field.source == "derived" ?
270-
(field.transform as ConceptTransformation).parentIDs.map((parentID) => allFields.find((f) => f.id == parentID) as FieldItem)
271-
: [field]);
272-
273-
for (let baseField of baseFields) {
274-
if (Object.keys(baseTableSchemaObj).includes(baseField.name)) {
275-
continue;
276-
}
277-
baseTableSchemaObj[baseField.name] = {
278-
channel,
279-
dtype: getDType(baseField.type, workingTable.map(r => r[baseField.name])),
280-
name: baseField.name,
281-
original: baseField.source == "original",
282-
// domain: {
283-
// values: [...new Set(baseField.domain.values)],
284-
// is_complete: baseField.domain.isComplete
285-
// },
286-
};
287-
}
288-
289272
// create the encoding
290273
encodingObj["field"] = field.name;
291274
encodingObj["type"] = getDType(field.type, workingTable.map(r => r[field.name]));
@@ -428,16 +411,8 @@ export const instantiateVegaTemplate = (chartType: string, encodingMap: { [key i
428411
vgObj = chartTemplate.postProcessor(vgObj, workingTable);
429412
}
430413

431-
// console.log(JSON.stringify(vgObj))
432-
433-
return [vgObj, baseTableSchemaObj];
434-
}
435-
436-
export const assembleChart = (chart: Chart, conceptShelfItems: FieldItem[], dataValues: any[]) => {
437-
438-
let vgSpec: any = instantiateVegaTemplate(chart.chartType, chart.encodingMap, conceptShelfItems, dataValues)[0];
439-
440-
let values = JSON.parse(JSON.stringify(dataValues));
414+
// this is the data that will be assembled into the vega chart
415+
let values = structuredClone(workingTable);
441416
values = values.map((r: any) => {
442417
let keys = Object.keys(r);
443418
let temporalKeys = keys.filter((k: string) => conceptShelfItems.some(concept => concept.name == k && (concept.type == "date" || concept.semanticType == "Year")));
@@ -446,9 +421,82 @@ export const assembleChart = (chart: Chart, conceptShelfItems: FieldItem[], data
446421
}
447422
return r;
448423
})
449-
return {...vgSpec, data: {values: values}}
424+
425+
// Handle nominal axes with many entries
426+
for (const channel of ['x', 'y', 'column', 'row', 'xOffset']) {
427+
const encoding = vgObj.encoding?.[channel];
428+
if (encoding?.type === 'nominal') {
429+
const fieldName = encoding.field;
430+
const uniqueValues = [...new Set(values.map((r: any) => r[fieldName]))];
431+
432+
let valuesToKeep: any[];
433+
if (uniqueValues.length > maxNominalValues) {
434+
435+
if (channel == 'x' || channel == 'y') {
436+
const oppositeChannel = channel === 'x' ? 'y' : 'x';
437+
const oppositeEncoding = vgObj.encoding?.[oppositeChannel];
438+
439+
if (oppositeEncoding?.type === 'quantitative') {
440+
// Sort by the quantitative field and take top maxNominalValues
441+
const quantField = oppositeEncoding.field;
442+
valuesToKeep = uniqueValues
443+
.map(val => ({
444+
value: val,
445+
sum: workingTable
446+
.filter(r => r[fieldName] === val)
447+
.reduce((sum, r) => sum + (r[quantField] || 0), 0)
448+
}))
449+
.sort((a, b) => b.sum - a.sum)
450+
.slice(0, maxNominalValues)
451+
.map(v => v.value);
452+
} else {
453+
// If no quantitative axis, just take first maxNominalValues
454+
valuesToKeep = uniqueValues.slice(0, maxNominalValues);
455+
}
456+
} else if (channel == 'row') {
457+
valuesToKeep = uniqueValues.slice(0, 20);
458+
} else {
459+
valuesToKeep = uniqueValues.slice(0, maxNominalValues);
460+
}
461+
462+
// Filter the working table
463+
const omittedCount = uniqueValues.length - maxNominalValues;
464+
const placeholder = `...${omittedCount} items omitted`;
465+
values = values.filter((row: any) => valuesToKeep.includes(row[fieldName]));
466+
467+
// Add text formatting configuration
468+
if (!encoding.axis) {
469+
encoding.axis = {};
470+
}
471+
encoding.axis.labelColor = {
472+
condition: {
473+
test: `datum.label == '${placeholder}'`,
474+
value: "#999999"
475+
},
476+
value: "#000000" // default color for other labels
477+
};
478+
encoding.axis.labelFont = {
479+
condition: {
480+
test: `datum.label == '${placeholder}'`,
481+
value: "italic"
482+
},
483+
value: "normal" // default font style for other labels
484+
};
485+
486+
// Add placeholder to domain
487+
if (!encoding.scale) {
488+
encoding.scale = {};
489+
}
490+
encoding.scale.domain = [...valuesToKeep, placeholder]
491+
}
492+
}
493+
}
494+
495+
return {...vgObj, data: {values: values}}
450496
}
451497

498+
499+
452500
export const adaptChart = (chart: Chart, targetTemplate: ChartTemplate) => {
453501

454502
let discardedChannels = Object.entries(chart.encodingMap).filter(([ch, enc]) => {

src/data/utils.ts

Lines changed: 40 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -3,12 +3,13 @@
33

44
import * as d3 from 'd3';
55
import Column from './column';
6+
import * as XLSX from 'xlsx';
67

78
import { DictTable } from '../components/ComponentType';
89
import { CoerceType, TestType, Type } from './types';
910
import { ColumnTable } from './table';
1011

11-
export const loadDataWrapper = (title: string, text: string, fileType: string): DictTable | undefined => {
12+
export const loadTextDataWrapper = (title: string, text: string, fileType: string): DictTable | undefined => {
1213

1314
let tableName = title;
1415
//let tableName = title.replace(/\.[^/.]+$/ , "");
@@ -18,8 +19,7 @@ export const loadDataWrapper = (title: string, text: string, fileType: string):
1819
table = createTableFromText(tableName, text);
1920
} else if (fileType == "application/json") {
2021
table = createTableFromFromObjectArray(tableName, JSON.parse(text));
21-
}
22-
22+
}
2323
return table;
2424
};
2525

@@ -43,7 +43,14 @@ export const createTableFromText = (title: string, text: string): DictTable | un
4343
// Should check the data file as well for the ending
4444
const isTabSeparated = tabNum / lineNum >= 1;
4545

46-
const values = isTabSeparated ? d3.tsvParse(text) : d3.csvParse(text);
46+
// Use d3.dsvFormat to create a custom parser that properly handles quoted fields
47+
// This ensures commas inside quoted fields won't be treated as delimiters
48+
const values = isTabSeparated
49+
? d3.tsvParse(text)
50+
: d3.dsvFormat(',').parse(text, row => {
51+
// Process each row to ensure proper type handling
52+
return row;
53+
});
4754

4855
return createTableFromFromObjectArray(title, values);
4956
};
@@ -145,20 +152,32 @@ export function tupleEqual(a: any[], b: any[]) {
145152
return true;
146153
}
147154

148-
// export function arrayEqual(_arr1: any[], _arr2: any[]) {
149-
// if (Array.isArray(_arr1) || !Array.isArray(_arr2) || _arr1.length !== _arr2.length) {
150-
// return false;
151-
// }
152-
153-
// // .concat() to not mutate arguments
154-
// const arr1 = _arr1.concat().sort();
155-
// const arr2 = _arr2.concat().sort();
156-
157-
// for (let i = 0; i < arr1.length; i++) {
158-
// if (arr1[i] !== arr2[i]) {
159-
// return false;
160-
// }
161-
// }
162-
163-
// return true;
164-
// }
155+
export const loadBinaryDataWrapper = (title: string, arrayBuffer: ArrayBuffer): DictTable[] => {
156+
try {
157+
// Read the Excel file
158+
const workbook = XLSX.read(arrayBuffer, { type: 'array' });
159+
160+
// Get all sheet names
161+
const sheetNames = workbook.SheetNames;
162+
163+
// Create tables for each sheet
164+
const tables: DictTable[] = [];
165+
166+
for (const sheetName of sheetNames) {
167+
// Get the worksheet
168+
const worksheet = workbook.Sheets[sheetName];
169+
170+
// Convert the worksheet to JSON
171+
const jsonData = XLSX.utils.sheet_to_json(worksheet);
172+
173+
// Create a table from the JSON data with sheet name included in the title
174+
const sheetTable = createTableFromFromObjectArray(`${title}-${sheetName}`, jsonData);
175+
tables.push(sheetTable);
176+
}
177+
178+
return tables;
179+
} catch (error) {
180+
console.error('Error processing Excel file:', error);
181+
return [];
182+
}
183+
};

0 commit comments

Comments
 (0)