Skip to content

Commit cf14c4c

Browse files
authored
Merge pull request #63 from NGO-Algorithm-Audit/feature/radio-buttons-remove-or-leave-NaN-values
Feature/radio buttons remove or leave NaN values
2 parents 628c998 + fb89c66 commit cf14c4c

File tree

7 files changed

+99
-1
lines changed

7 files changed

+99
-1
lines changed

src/assets/synthetic-data.tsx

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@ from js import isDemo
2828
from js import sdgMethod
2929
from js import samples
3030
from js import setOutputData
31+
from js import nanTreatment
3132
3233
3334
class GaussianCopulaSynthesizer:
@@ -77,7 +78,7 @@ def run():
7778
csv_data = StringIO(data)
7879
7980
admissions_df = pd.read_csv(csv_data, index_col=False)
80-
81+
print("nanTreatment:", nanTreatment)
8182
# admissions_sub = admissions_df[['sex', 'race1', 'ugpa', 'bar']]
8283
# real_data = admissions_sub.dropna()
8384
@@ -227,6 +228,18 @@ def run():
227228
if (sdgMethod == 'gc'):
228229
# Initialize synthesizer and fit it to the data
229230
synthesizer = GaussianCopulaSynthesizer()
231+
232+
# Handle NaN values based on the selected treatment method
233+
if nanTreatment == 'drop':
234+
df_imputed = df_imputed.dropna()
235+
elif nanTreatment == 'impute':
236+
# Use mean imputation for numerical columns and mode imputation for categorical columns
237+
for column in df_imputed.columns:
238+
if column_dtypes[column] == 'categorical':
239+
df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
240+
else:
241+
df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
242+
230243
synthesizer.fit(df_imputed)
231244
232245
# Generate synthetic data

src/components/SyntheticDataSettings.tsx

Lines changed: 65 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ const createFormSchema = (t: (key: string) => string) =>
2727
required_error: t('syntheticData.form.errors.csvRequired'),
2828
}),
2929
sdgMethod: z.string(),
30+
nanTreatment: z.string(),
3031
});
3132

3233
interface DemoDataColumns {
@@ -56,6 +57,7 @@ export default function SyntheticDataSettings({
5657
resolver: zodResolver(FormSchema),
5758
defaultValues: {
5859
sdgMethod: 'cart',
60+
nanTreatment: 'drop',
5961
},
6062
});
6163
const [columnsCountError, setColumnsCountError] = useState(false);
@@ -117,6 +119,7 @@ export default function SyntheticDataSettings({
117119
isDemo: false,
118120
sdgMethod: data.sdgMethod,
119121
samples: outputSamples[0],
122+
nanTreatment: data.nanTreatment,
120123
});
121124
};
122125
return (
@@ -209,6 +212,68 @@ export default function SyntheticDataSettings({
209212
</RadioGroup>
210213
)}
211214
/>
215+
216+
{form.watch('sdgMethod') === 'gc' && (
217+
<div className="mt-4">
218+
<label className="text-sm font-medium flex flex-row items-center gap-1">
219+
{t(
220+
'syntheticData.form.fieldset.nanTreatment.title'
221+
)}
222+
<TooltipProvider>
223+
<Tooltip>
224+
<TooltipTrigger
225+
onClick={event => {
226+
event.preventDefault();
227+
}}
228+
>
229+
<InfoIcon className="size-3.5" />
230+
</TooltipTrigger>
231+
<TooltipContent>
232+
<div className="whitespace-pre-wrap max-w-full w-[400px] p-2">
233+
<Markdown className="-mt-2 text-gray-800 markdown">
234+
{t(
235+
'syntheticData.form.fieldset.nanTreatment.tooltip'
236+
)}
237+
</Markdown>
238+
</div>
239+
</TooltipContent>
240+
</Tooltip>
241+
</TooltipProvider>
242+
</label>
243+
<FormField
244+
control={form.control}
245+
name="nanTreatment"
246+
render={({ field }) => (
247+
<RadioGroup
248+
onValueChange={field.onChange}
249+
defaultValue={field.value}
250+
className="flex flex-col space-y-1 mt-2"
251+
>
252+
<FormItem className="flex items-center space-x-3 space-y-0">
253+
<FormControl>
254+
<RadioGroupItem value="drop" />
255+
</FormControl>
256+
<FormLabel className="font-normal">
257+
{t(
258+
'syntheticData.form.fieldset.nanTreatment.drop'
259+
)}
260+
</FormLabel>
261+
</FormItem>
262+
<FormItem className="flex items-center space-x-3 space-y-0">
263+
<FormControl>
264+
<RadioGroupItem value="impute" />
265+
</FormControl>
266+
<FormLabel className="font-normal">
267+
{t(
268+
'syntheticData.form.fieldset.nanTreatment.impute'
269+
)}
270+
</FormLabel>
271+
</FormItem>
272+
</RadioGroup>
273+
)}
274+
/>
275+
</div>
276+
)}
212277
</div>
213278

214279
<div className="grid gap-3">

src/components/synthetic-data-interfaces/SyntheticDataParameters.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,4 +2,5 @@ export interface SyntheticDataParameters {
22
isDemo: boolean;
33
sdgMethod: string;
44
samples: number;
5+
nanTreatment: string;
56
}

src/components/synthetic-data-interfaces/cluster-export.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
export interface SyntheticDataInfo {
22
isDemo: boolean;
33
sdgMethod: string;
4+
nanTreatment: string;
45
samples: number;
56
syntheticData: object;
67
date: Date;

src/locales/en.ts

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -110,6 +110,13 @@ export const en = {
110110
tooltip:
111111
'By default, the CART method is used to generate synthetic data. CART generally produces higher quality synthetic data, but might not work well on datasets with categorical variables with 20+ categories. Use Gaussian Copula in those cases.',
112112
},
113+
nanTreatment: {
114+
title: 'NaN Values Treatment',
115+
drop: 'Drop rows with NaN values',
116+
impute: 'Impute NaN values',
117+
tooltip:
118+
'When using Gaussian Copula, you can choose how to handle missing values (NaN values) in your dataset. Dropping rows with NaN values removes them completely, while imputation replaces them with mean values for numerical columns and mode values for categorical columns.',
119+
},
113120
samples: 'Number of synthetic datapoints',
114121
},
115122
actions: {

src/locales/nl.ts

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,13 @@ export const nl = {
113113
tooltip:
114114
'In principe wordt de CART-methode gebruikt om synthetische data te genereren. CART levert over het algemeen synthetische data van hoge kwaliteit, maar werkt mogelijk niet goed bij datasets met categorische variabelen met meer dan 20 categorieën. Gebruik in die gevallen de Gaussian Copula.',
115115
},
116+
nanTreatment: {
117+
title: 'NaN Waarden Behandeling',
118+
drop: 'Verwijder rijen met NaN waarden',
119+
impute: 'Vervang NaN waarden',
120+
tooltip:
121+
'Bij gebruik van Gaussian Copula kunt u kiezen hoe u omgaat met ontbrekende waarden (NaN waarden) in uw dataset. Het verwijderen van rijen met NaN waarden verwijdert deze volledig, terwijl imputatie deze vervangt door gemiddelde waarden voor numerieke kolommen en modus waarden voor categorische kolommen.',
122+
},
116123
samples: 'Aantal synthetische datapunten',
117124
},
118125
actions: {

src/routes/SyntheticData.tsx

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,7 @@ export default function SyntheticDataGeneration() {
7373
isDemo: false,
7474
sdgMethod: 'gc',
7575
samples: 1000,
76+
nanTreatment: '',
7677
});
7778

7879
const params = new URLSearchParams(window.location.search);
@@ -110,6 +111,7 @@ export default function SyntheticDataGeneration() {
110111
isDemo: true,
111112
sdgMethod: 'cart',
112113
samples: 5000,
114+
nanTreatment: '',
113115
});
114116
}
115117
}, [initialised, data]);
@@ -118,6 +120,7 @@ export default function SyntheticDataGeneration() {
118120
isDemo: boolean;
119121
sdgMethod: string;
120122
samples: number;
123+
nanTreatment: string;
121124
}) => {
122125
runPython({
123126
type: 'start',
@@ -126,6 +129,7 @@ export default function SyntheticDataGeneration() {
126129
isDemo: props.isDemo,
127130
sdgMethod: props.sdgMethod,
128131
samples: props.samples,
132+
nanTreatment: props.nanTreatment,
129133
},
130134
},
131135
});

0 commit comments

Comments
 (0)