Skip to content

Commit 54a3bce

Browse files
committed
lots of tweaks and fixes to get synthetic data generation working for a v0
1 parent c2bfe0d commit 54a3bce

File tree

9 files changed

+22476
-96
lines changed

9 files changed

+22476
-96
lines changed

package-lock.json

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

public/Bar-Pass-Prediction.csv

Lines changed: 22408 additions & 0 deletions
Large diffs are not rendered by default.

src/assets/synthetic-data.tsx

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,9 @@ start = time.time()
1717
1818
from js import data
1919
from js import setResult
20-
from js import setMostBiasedCluster
21-
from js import setOtherClusters
22-
from js import iter
23-
from js import clusters
24-
from js import targetColumn
2520
from js import dataType
26-
from js import higherIsBetter
21+
from js import isDemo
22+
2723
2824
class GaussianCopulaSynthesizer:
2925
def __init__(self):
@@ -124,7 +120,17 @@ def run():
124120
admissions_sub = admissions_df[['sex', 'race', 'gpa']]
125121
real_data = admissions_sub.dropna()
126122
127-
print(real_data.head())
123+
if isDemo:
124+
setResult(json.dumps(
125+
{'type': 'heading', 'data': '''Demo'''}
126+
))
127+
setResult(json.dumps(
128+
{'type': 'text', 'data': '''A demo dataset is loaded below. We will now generate synthetic data on the columns: 'sex', 'gpa', 'race'. We will be using the Gaussian Copula method and evaluate the distribution and correlation differences between the real and synthetic data.'''}
129+
))
130+
131+
setResult(json.dumps(
132+
{'type': 'data-set-preview', 'data': ''}
133+
))
128134
129135
# Initialize synthesizer and fit it to the data
130136
synthesizer = GaussianCopulaSynthesizer()
@@ -139,16 +145,26 @@ def run():
139145
140146
results = run_diagnostic(real_data, synthetic_data, target_column='gpa')
141147
print('Results:', results)
142-
143148
setResult(json.dumps(
144-
{'type': 'heading', 'data': 'Parameters selected'}
149+
{'type': 'heading', 'data': 'Diagnostic Results:'}
145150
))
151+
setResult(json.dumps({'type': 'table', 'data': json.dumps([
152+
{
153+
'attribute': key,
154+
'ks_stat': values['ks_stat'],
155+
'p_value': values['p_value']
156+
}
157+
for key, values in results['distribution_results'].items()
158+
])}))
146159
147-
print('table output', json.dumps({'type': 'table', 'data': json.loads(synthetic_data.to_json(orient="records"))}))
160+
setResult(json.dumps(
161+
{'type': 'heading', 'data': 'Correlation difference: ' + str(results['correlation_diff']) }
162+
))
148163
149164
setResult(json.dumps(
150-
{'type': 'table', 'data': json.dumps({'type': 'table', 'data': json.loads(synthetic_data.to_json(orient="records"))})}
165+
{'type': 'heading', 'data': 'Output file:'}
151166
))
167+
setResult(json.dumps({'type': 'table', 'data': synthetic_data.to_json(orient="records")}))
152168
return
153169
154170

src/components/SimpleTable.tsx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ export default function SimpleTable({
1515
title?: string;
1616
data: Record<string, string | number>[];
1717
}) {
18+
// limit data to the first 100 rows.
19+
data = data.slice(0, 100);
20+
1821
return (
1922
<div className={`bg-white border border-gray-200 ${title && 'mb-4'}`}>
2023
<Table className={`text-xs ${title && 'mb-4'}`}>

src/components/SyntheticDataSettings.tsx

Lines changed: 6 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,14 @@
1-
import { Label } from '@/components/ui/label';
2-
import {
3-
Select,
4-
SelectContent,
5-
SelectItem,
6-
SelectTrigger,
7-
SelectValue,
8-
} from '@/components/ui/select';
9-
import { Slider } from '@/components/ui/slider';
10-
import { RadioGroup, RadioGroupItem } from '@/components/ui/radio-group';
11-
121
import CSVReader, { csvReader } from './CSVReader';
132
import { useEffect, useState } from 'react';
143
import { Button } from './ui/button';
154
import { ArrowDown, ArrowRight } from 'lucide-react';
165
import { z } from 'zod';
176
import { useForm } from 'react-hook-form';
187
import { zodResolver } from '@hookform/resolvers/zod';
19-
import { Form, FormControl, FormField, FormItem, FormLabel } from './ui/form';
8+
import { Form, FormField } from './ui/form';
209
import { Card, CardDescription, CardHeader, CardTitle } from './ui/card';
2110
import Papa from 'papaparse';
11+
import { SyntheticDataParameters } from './synthetic-data-interfaces/BiasDetectionParameters';
2212

2313
const FormSchema = z.object({
2414
file: z.string({
@@ -33,13 +23,7 @@ export default function BiasSettings({
3323
isErrorDuringAnalysis,
3424
isInitialised,
3525
}: {
36-
onRun: (
37-
clusterSize: number,
38-
iterations: number,
39-
targetColumn: string,
40-
dataType: string,
41-
higherIsBetter: boolean
42-
) => void;
26+
onRun: (params: SyntheticDataParameters) => void;
4327
onDataLoad: csvReader['onChange'];
4428
isLoading: boolean;
4529
isErrorDuringAnalysis: boolean;
@@ -48,10 +32,6 @@ export default function BiasSettings({
4832
const form = useForm<z.infer<typeof FormSchema>>({
4933
resolver: zodResolver(FormSchema),
5034
});
51-
const [iter, setIter] = useState([10]);
52-
const [clusters, setClusters] = useState([25]);
53-
54-
const [dataKey, setDataKey] = useState<string>(new Date().toISOString());
5535
const [data, setData] = useState<{
5636
data: Record<string, string>[];
5737
stringified: string;
@@ -69,18 +49,14 @@ export default function BiasSettings({
6949
form.setValue('file', stringified);
7050
}
7151
setData({ data, stringified, fileName });
72-
73-
const dataLength = (data?.length || 1000) / 10;
74-
setClusters([Math.round(dataLength / 4)]);
75-
setDataKey(new Date().toISOString());
7652
};
7753

7854
useEffect(() => {
7955
onDataLoad(data.data, data.stringified, data.fileName);
8056
}, [data]);
8157

8258
const onDemoRun = async () => {
83-
const file = await fetch('/FP-test-set.csv')
59+
const file = await fetch('/Bar-Pass-Prediction.csv')
8460
.then(response => response.text())
8561
.then(data => Papa.parse(data, { header: true }));
8662
onDataLoad(
@@ -91,14 +67,8 @@ export default function BiasSettings({
9167
);
9268
};
9369

94-
const onSubmit = (data: z.infer<typeof FormSchema>) => {
95-
onRun(
96-
clusters[0],
97-
iter[0],
98-
data.targetColumn,
99-
data.dataType,
100-
data.whichPerformanceMetricValueIsBetter === 'higher'
101-
);
70+
const onSubmit = () => {
71+
onRun({ dataType: 'numeric', isDemo: false });
10272
};
10373

10474
return (
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
export interface SyntheticDataParameters {
2+
dataType: string;
3+
isDemo: boolean;
4+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
export interface SyntheticDataInfo {
2+
dataType: string;
3+
isDemo: boolean;
4+
}
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
export interface CSVData {
2+
data: Record<string, string>[];
3+
stringified: string;
4+
fileName: string;
5+
demo?: boolean;
6+
}

src/routes/SyntheticData.tsx

Lines changed: 12 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,12 @@ import { pythonCode } from '@/assets/synthetic-data';
44
import { usePython } from '@/components/pyodide/use-python';
55
import { Share } from 'lucide-react';
66
import { csvReader } from '@/components/CSVReader';
7-
import SimpleTable from '@/components/SimpleTable';
87
import { cn } from '@/lib/utils';
98
import ComponentMapper from '@/components/componentMapper';
10-
import { downloadFile } from '@/lib/download-file';
119
import { useReactToPrint } from 'react-to-print';
1210
import Measuring from '@/components/icons/measuring.svg?react';
1311
import SyntheticDataSettings from '@/components/SyntheticDataSettings';
12+
import { SyntheticDataInfo } from '@/components/synthetic-data-interfaces/cluster-export';
1413

1514
const PAGE_STYLE = `
1615
@page {
@@ -60,8 +59,10 @@ export default function SyntheticDataGeneration() {
6059
runPython,
6160
sendData,
6261
error,
63-
clusterInfo,
64-
} = usePython();
62+
} = usePython<SyntheticDataInfo, SyntheticDataInfo>({
63+
dataType: 'numeric',
64+
isDemo: false,
65+
});
6566

6667
const onFileLoad: csvReader['onChange'] = (
6768
data,
@@ -85,25 +86,18 @@ export default function SyntheticDataGeneration() {
8586
sendData(data.stringified);
8687
}
8788
if (data.demo) {
88-
onRun(3, 10, 'FP', 'numeric', false);
89+
onRun({ dataType: 'numeric', isDemo: true });
8990
}
9091
}, [initialised, data]);
9192

92-
const onRun = (
93-
clusterSize: number,
94-
iterations: number,
95-
targetColumn: string,
96-
dataType: string,
97-
higherIsBetter: boolean
98-
) => {
93+
const onRun = (props: { dataType: string; isDemo: boolean }) => {
9994
runPython({
10095
type: 'start',
10196
params: {
102-
iter: iterations,
103-
clusters: clusterSize,
104-
targetColumn: targetColumn,
105-
dataType: dataType,
106-
higherIsBetter: higherIsBetter,
97+
parameters: {
98+
dataType: props.dataType,
99+
isDemo: props.isDemo,
100+
},
107101
},
108102
});
109103
};
@@ -137,42 +131,11 @@ export default function SyntheticDataGeneration() {
137131
<Share className="size-3.5 mr-2" />
138132
Share
139133
</Button>
140-
{clusterInfo && (
141-
<Button
142-
variant="outline"
143-
size="sm"
144-
className="p-4 text-sm"
145-
onClick={() => {
146-
downloadFile(
147-
JSON.stringify(
148-
{
149-
fileName: data.fileName,
150-
...clusterInfo,
151-
},
152-
null,
153-
2
154-
),
155-
`${data.fileName.replace('.csv', '') || 'cluster-info'}-${clusterInfo.date.toISOString()}.json`,
156-
'application/json'
157-
);
158-
}}
159-
>
160-
<Share className="size-3.5 mr-2" />
161-
Export to .json
162-
</Button>
163-
)}
164134
</div>
165135
)}
166136

167-
{data.data.length > 0 && (
168-
<SimpleTable
169-
data={data.data.slice(0, 5)}
170-
title="Dataset preview showing the first 5 rows."
171-
/>
172-
)}
173-
174137
{result.length > 0 ? (
175-
<ComponentMapper items={result} />
138+
<ComponentMapper items={result} data={data} />
176139
) : data.data.length > 0 ? null : (
177140
<>
178141
<Measuring className="max-w-96 m-auto 2xl:max-w-full" />

0 commit comments

Comments
 (0)