Skip to content

Commit 8f7cc46

Browse files
committed
Add multi-select component and a basic synthetic-data setup to test the Python dependencies.
1 parent 3642378 commit 8f7cc46

File tree

10 files changed

+1744
-296
lines changed

10 files changed

+1744
-296
lines changed

package-lock.json

Lines changed: 843 additions & 124 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@
1717
"@heroicons/react": "^2.1.5",
1818
"@hookform/resolvers": "^3.9.0",
1919
"@radix-ui/react-checkbox": "^1.1.2",
20-
"@radix-ui/react-dialog": "^1.1.1",
20+
"@radix-ui/react-dialog": "^1.1.2",
2121
"@radix-ui/react-icons": "^1.3.0",
2222
"@radix-ui/react-label": "^2.1.0",
23+
"@radix-ui/react-popover": "^1.1.2",
2324
"@radix-ui/react-radio-group": "^1.2.1",
2425
"@radix-ui/react-select": "^2.1.1",
26+
"@radix-ui/react-separator": "^1.1.0",
2527
"@radix-ui/react-slider": "^1.2.0",
2628
"@radix-ui/react-slot": "^1.1.0",
2729
"@radix-ui/react-tooltip": "^1.1.2",
@@ -30,6 +32,7 @@
3032
"ace-builds": "^1.35.4",
3133
"class-variance-authority": "^0.7.0",
3234
"clsx": "^2.1.1",
35+
"cmdk": "^1.0.0",
3336
"d3": "^7.9.0",
3437
"lucide-react": "^0.436.0",
3538
"papaparse": "^5.4.1",

src/assets/synthetic-data.tsx

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
export const pythonCode = `
2+
import random
3+
import json
4+
import pandas as pd
5+
import numpy as np
6+
import warnings
7+
import scipy.stats as stats
8+
from scipy.stats import norm, ks_2samp
9+
10+
warnings.filterwarnings('ignore')
11+
12+
from io import StringIO
13+
from unsupervised_bias_detection.clustering import BiasAwareHierarchicalKModes
14+
from unsupervised_bias_detection.clustering import BiasAwareHierarchicalKMeans
15+
import time
16+
start = time.time()
17+
18+
from js import data
19+
from js import setResult
20+
from js import setMostBiasedCluster
21+
from js import setOtherClusters
22+
from js import iter
23+
from js import clusters
24+
from js import targetColumn
25+
from js import dataType
26+
from js import higherIsBetter
27+
28+
class GaussianCopulaSynthesizer:
    """
    Minimal Gaussian copula synthesizer for tabular data.

    fit() stores the sorted values of every column (its empirical marginal)
    and estimates the mean vector and covariance matrix of the
    rank-Gaussianized data. sample() draws from that multivariate normal and
    maps each draw back onto the stored marginals by linear interpolation.
    """

    def __init__(self):
        self.means = None
        self.cov_matrix = None
        self.scaler = None  # not used by fit() or sample()
        self.data_marginals = None

    def fit(self, data):
        """
        Fit the Gaussian Copula model to the given data.

        Args:
            data: pandas DataFrame with one column per feature and at least
                two rows.

        Raises:
            ValueError: if data has fewer than two rows, because a
                covariance matrix cannot be estimated from it.
        """
        n_rows = len(data)
        if n_rows < 2:
            raise ValueError("At least two rows are required to fit the Gaussian copula.")

        # Step 1: Store data marginals (sorted values plus matching quantiles)
        quantiles = np.linspace(0, 1, n_rows)
        self.data_marginals = [
            (np.sort(data[col]), quantiles, col) for col in data.columns
        ]

        # Step 2: Gaussianize through the empirical CDF.
        # Ranks are divided by n + 1 rather than n: with rank(pct=True) the
        # largest value of a column gets exactly 1.0 and norm.ppf(1.0) is
        # +inf, which makes the fitted mean infinite and the covariance NaN.
        uniform_data = data.rank() / (n_rows + 1)
        gaussian_data = norm.ppf(uniform_data.to_numpy())

        # Step 3: Fit a multivariate Gaussian to the normalized data
        self.means = gaussian_data.mean(axis=0)
        # np.cov returns a 0-d array for a single column; keep it 2-D so it
        # is always a valid covariance matrix for multivariate_normal.
        self.cov_matrix = np.atleast_2d(np.cov(gaussian_data, rowvar=False))

    def sample(self, n_samples):
        """
        Generate synthetic data using the fitted Gaussian Copula model.

        Args:
            n_samples: number of synthetic rows to generate.

        Returns:
            pandas DataFrame with n_samples rows and the same columns, in the
            same order, as the data passed to fit().

        Raises:
            RuntimeError: if fit() has not been called yet.
        """
        if self.data_marginals is None:
            raise RuntimeError("fit() must be called before sample().")

        # Step 1: Sample from the multivariate normal distribution
        synthetic_gaussian = np.random.multivariate_normal(
            self.means, self.cov_matrix, n_samples
        )

        # Step 2: Convert back to uniform distribution using CDF (normal -> uniform)
        synthetic_uniform = norm.cdf(synthetic_gaussian)

        # Step 3: Map uniform data back to the original marginals
        synthetic_data = pd.DataFrame()
        for i, (sorted_data, quantiles, col) in enumerate(self.data_marginals):
            synthetic_data[col] = np.interp(synthetic_uniform[:, i], quantiles, sorted_data)

        return synthetic_data
70+
71+
72+
def evaluate_distribution(real_data, synthetic_data):
    """
    Run a two-sample Kolmogorov-Smirnov test per column.

    Every column of real_data is compared with the column of the same name
    in synthetic_data; missing values are dropped from both sides first.

    Returns:
        dict mapping each column name to a dict with the keys 'ks_stat' and
        'p_value' taken from scipy.stats.ks_2samp.
    """
    def _ks_for(name):
        observed = real_data[name].dropna()
        generated = synthetic_data[name].dropna()
        statistic, pvalue = ks_2samp(observed, generated)
        return {'ks_stat': statistic, 'p_value': pvalue}

    return {name: _ks_for(name) for name in real_data.columns}
88+
89+
def evaluate_correlations(real_data, synthetic_data):
    """
    Measure how far the synthetic pairwise correlations drift from the real ones.

    Returns:
        The mean over columns of the per-column mean absolute difference
        between the two correlation matrices.
    """
    gap = (real_data.corr() - synthetic_data.corr()).abs()
    per_column = gap.mean()
    return per_column.mean()  # Average correlation difference
99+
100+
def run_diagnostic(real_data, synthetic_data, target_column):
    """
    Collect the quality diagnostics for a synthetic dataset.

    Two checks are run: the per-column distribution comparison from
    evaluate_distribution and the correlation comparison from
    evaluate_correlations. target_column is accepted but is not used by
    either check.

    Returns:
        dict with the keys 'distribution_results' and 'correlation_diff'.
    """
    return {
        # Per-column comparison of real vs. synthetic distributions
        'distribution_results': evaluate_distribution(real_data, synthetic_data),
        # Single number summarizing the correlation matrix difference
        'correlation_diff': evaluate_correlations(real_data, synthetic_data),
    }
118+
119+
def run():
    """
    Synthesize data from the CSV text supplied by the page and publish it.

    Reads the CSV held in the js-provided data value, keeps the 'sex',
    'race' and 'gpa' columns without missing rows, fits a
    GaussianCopulaSynthesizer, draws 1000 synthetic rows, prints the
    diagnostics and hands a heading and a table payload to setResult.
    """
    source_frame = pd.read_csv(StringIO(data))
    real_data = source_frame[['sex', 'race', 'gpa']].dropna()

    print(real_data.head())

    # Fit the copula model on the real rows, then draw the synthetic rows
    synthesizer = GaussianCopulaSynthesizer()
    synthesizer.fit(real_data)
    synthetic_data = synthesizer.sample(1000)

    # Output some results
    print("Original Data (first 5 rows):", real_data.head())
    print("Synthetic Data (first 5 rows):", synthetic_data.head())

    results = run_diagnostic(real_data, synthetic_data, target_column='gpa')
    print('Results:', results)

    heading_payload = {'type': 'heading', 'data': 'Parameters selected'}
    setResult(json.dumps(heading_payload))

    # Serialize the synthetic rows once; the same JSON text is printed and sent
    records = json.loads(synthetic_data.to_json(orient="records"))
    table_json = json.dumps({'type': 'table', 'data': records})
    print('table output', table_json)

    # NOTE(review): the 'data' field below is itself a JSON-encoded table
    # payload (a string), not the list of records - confirm that the
    # consumer of setResult expects this nesting.
    setResult(json.dumps({'type': 'table', 'data': table_json}))
153+
154+
155+
if data != 'INIT':
156+
run()
157+
`;

src/components/SyntheticDataSettings.tsx

Lines changed: 1 addition & 169 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,6 @@ const FormSchema = z.object({
2424
file: z.string({
2525
required_error: 'Please upload a CSV file.',
2626
}),
27-
whichPerformanceMetricValueIsBetter: z.string(),
28-
targetColumn: z
29-
.string({
30-
required_error: 'Please select a target column.',
31-
})
32-
.nonempty(),
33-
dataType: z
34-
.string({
35-
required_error: 'Please select a data type.',
36-
})
37-
.nonempty(),
3827
});
3928

4029
export default function BiasSettings({
@@ -58,10 +47,6 @@ export default function BiasSettings({
5847
}) {
5948
const form = useForm<z.infer<typeof FormSchema>>({
6049
resolver: zodResolver(FormSchema),
61-
defaultValues: {
62-
dataType: 'numeric',
63-
whichPerformanceMetricValueIsBetter: 'lower',
64-
},
6550
});
6651
const [iter, setIter] = useState([10]);
6752
const [clusters, setClusters] = useState([25]);
@@ -121,7 +106,7 @@ export default function BiasSettings({
121106
<div className="h-auto md:h-full flex flex-col justify-between">
122107
<form
123108
onSubmit={form.handleSubmit(onSubmit)}
124-
className="grid w-full items-start gap-2 -mt-2 grid-cols-1 sm:gap-4 sm:grid-cols-2"
109+
className="grid w-full items-start gap-2 -mt-2 grid-cols-1"
125110
>
126111
<fieldset className="grid gap-6 rounded-lg border p-4">
127112
<legend className="-ml-1 px-1 text-sm font-medium">
@@ -136,161 +121,8 @@ export default function BiasSettings({
136121
)}
137122
/>
138123
</div>
139-
<div className="grid gap-3">
140-
<FormField
141-
control={form.control}
142-
name="targetColumn"
143-
render={({ field }) => (
144-
<FormItem>
145-
<FormLabel>
146-
Performance metric column
147-
</FormLabel>
148-
<Select
149-
onValueChange={field.onChange}
150-
key={`${dataKey}_select`}
151-
>
152-
<FormControl>
153-
<SelectTrigger>
154-
<SelectValue placeholder="Select a column" />
155-
</SelectTrigger>
156-
</FormControl>
157-
<SelectContent>
158-
{data.data?.[0] ? (
159-
Object.keys(
160-
data.data?.[0] ?? {}
161-
)
162-
.filter(
163-
column => column
164-
)
165-
.map(column => (
166-
<SelectItem
167-
key={`${dataKey}${column}`}
168-
value={column}
169-
>
170-
{column}
171-
</SelectItem>
172-
))
173-
) : (
174-
<SelectItem
175-
value="noData"
176-
disabled
177-
>
178-
No data loaded
179-
</SelectItem>
180-
)}
181-
</SelectContent>
182-
</Select>
183-
</FormItem>
184-
)}
185-
/>
186-
</div>
187-
<div className="grid gap-3">
188-
<FormField
189-
control={form.control}
190-
name="dataType"
191-
render={({ field }) => (
192-
<FormItem>
193-
<FormLabel>Data type</FormLabel>
194-
<Select
195-
defaultValue="numeric"
196-
onValueChange={field.onChange}
197-
key={`${dataKey}_dataType`}
198-
>
199-
<FormControl>
200-
<SelectTrigger>
201-
<SelectValue placeholder="Select dataType" />
202-
</SelectTrigger>
203-
</FormControl>
204-
<SelectContent>
205-
<SelectItem
206-
key="numeric"
207-
value="numeric"
208-
>
209-
Numeric
210-
</SelectItem>
211-
<SelectItem
212-
key="categorical"
213-
value="categorical"
214-
>
215-
Categorical
216-
</SelectItem>
217-
</SelectContent>
218-
</Select>
219-
</FormItem>
220-
)}
221-
/>
222-
</div>
223124
</fieldset>
224125

225-
{/* <fieldset className="grid gap-6 rounded-lg border p-4">
226-
<legend className="-ml-1 px-1 text-sm font-medium">
227-
Parameters
228-
</legend>
229-
<div className="grid gap-3">
230-
<Label htmlFor="iterations">
231-
Iterations ({iter})
232-
</Label>
233-
<Slider
234-
id="iterations"
235-
defaultValue={iter}
236-
max={100}
237-
step={1}
238-
onValueChange={value => setIter(value)}
239-
className="cursor-pointer"
240-
/>
241-
</div>
242-
<div className="grid gap-3">
243-
<Label htmlFor="min-cluster-size">
244-
Minimal cluster size ({clusters})
245-
</Label>
246-
<Slider
247-
id="min-cluster-size"
248-
defaultValue={clusters}
249-
key={`${dataKey}_clusters`}
250-
max={Math.floor(
251-
(data?.data?.length || 1000) / 10
252-
)}
253-
step={1}
254-
onValueChange={value => setClusters(value)}
255-
className="cursor-pointer"
256-
/>
257-
</div>
258-
<div className="flex flex-row gap-3">
259-
<FormField
260-
control={form.control}
261-
name="whichPerformanceMetricValueIsBetter"
262-
render={({ field }) => (
263-
<RadioGroup
264-
onValueChange={field.onChange}
265-
defaultValue={field.value}
266-
key={`${dataKey}_whichPerformanceMetricValueIsBetter`}
267-
className="flex flex-col space-y-1"
268-
>
269-
<FormItem className="flex items-center space-x-3 space-y-0">
270-
<FormControl>
271-
<RadioGroupItem value="lower" />
272-
</FormControl>
273-
<FormLabel className="font-normal">
274-
Lower value of performance
275-
metric is better, e.g., error
276-
rate
277-
</FormLabel>
278-
</FormItem>
279-
<FormItem className="flex items-center space-x-3 space-y-0">
280-
<FormControl>
281-
<RadioGroupItem value="higher" />
282-
</FormControl>
283-
<FormLabel className="font-normal">
284-
Higher value of performance
285-
metric is better, e.g., accuracy
286-
</FormLabel>
287-
</FormItem>
288-
</RadioGroup>
289-
)}
290-
></FormField>
291-
</div>
292-
</fieldset> */}
293-
294126
<div className="flex flex-row ml-auto gap-2">
295127
{isErrorDuringAnalysis && (
296128
<div className="text-red-500">

0 commit comments

Comments
 (0)