Skip to content

Commit e48e8a2

Browse files
authored
Merge pull request #66 from NGO-Algorithm-Audit/feature/ui-texts
Feature/UI texts
2 parents 13d990d + c1c2265 commit e48e8a2

File tree

4 files changed

+69
-24
lines changed

4 files changed

+69
-24
lines changed

src/assets/synthetic-data.tsx

Lines changed: 34 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import warnings
77
import scipy.stats as stats
88
from scipy.stats import norm, ks_2samp
99
from sklearn.preprocessing import LabelEncoder
10-
from synthpop import MissingDataHandler, DataProcessor, CARTMethod
10+
from synthpop import MissingDataHandler, DataProcessor, CARTMethod, GaussianCopulaMethod
1111
from synthpop.metrics import (
1212
MetricsReport,
1313
EfficacyMetrics,
@@ -177,11 +177,16 @@ def run():
177177
# Preprocess the data: transforms raw data into a numerical format
178178
processed_data = processor.preprocess(df_imputed)
179179
180-
cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
181-
cart.fit(processed_data)
182-
183-
synthetic_processed = cart.sample(samples)
184-
180+
if (sdgMethod == 'cart'):
181+
cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
182+
cart.fit(processed_data)
183+
synthetic_processed = cart.sample(samples)
184+
185+
if (sdgMethod == 'gc'):
186+
gc = GaussianCopulaMethod(metadata)
187+
gc.fit(processed_data)
188+
synthetic_processed = gc.sample(samples)
189+
185190
print("synthetic_processed (first 5 rows):", synthetic_processed.head())
186191
187192
synthetic_data = processor.postprocess(synthetic_processed)
@@ -225,25 +230,25 @@ def run():
225230
# spop.fit(real_data, dtypes=dtypes_dict)
226231
# synthetic_data = spop.generate(k=samples)
227232
228-
if (sdgMethod == 'gc'):
233+
# if (sdgMethod == 'gc'):
229234
# Initialize synthesizer and fit it to the data
230-
synthesizer = GaussianCopulaSynthesizer()
235+
# synthesizer = GaussianCopulaSynthesizer()
231236
232237
# Handle NaN values based on the selected treatment method
233-
if nanTreatment == 'drop':
234-
df_imputed = df_imputed.dropna()
235-
elif nanTreatment == 'impute':
238+
# if nanTreatment == 'drop':
239+
# df_imputed = df_imputed.dropna()
240+
# elif nanTreatment == 'impute':
236241
# Use mean imputation for numerical columns and mode imputation for categorical columns
237-
for column in df_imputed.columns:
238-
if column_dtypes[column] == 'categorical':
239-
df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
240-
else:
241-
df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
242+
# for column in df_imputed.columns:
243+
# if column_dtypes[column] == 'categorical':
244+
# df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
245+
# else:
246+
# df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
242247
243-
synthesizer.fit(df_imputed)
248+
#synthesizer.fit(df_imputed)
244249
245250
# Generate synthetic data
246-
synthetic_data = synthesizer.sample(samples)
251+
# synthetic_data = synthesizer.sample(samples)
247252
248253
synth_df_decoded = synthetic_data.copy()
249254
@@ -358,16 +363,24 @@ def run():
358363
{
359364
'reportType': 'heading2',
360365
'headingKey': 'syntheticData.diagnosticsReportTitle'
361-
},
366+
},
362367
{
363368
'reportType': 'table',
364369
'titleKey': 'syntheticData.diagnosticsTitle',
365-
'showIndex' : False,
370+
'showIndex' : False,
371+
'preContent' : [{
372+
'contentType': 'text',
373+
'textKey': 'syntheticData.diagnosticsReportDescription'
374+
}],
366375
'data': report_df.to_json(orient="records"),
367376
},
368377
{
369378
'reportType': 'correlationSyntheticData',
370-
'titleKey': 'syntheticData.correlationMatrixTitle',
379+
'titleKey': 'syntheticData.correlationMatrixTitle',
380+
'preContent' : [{
381+
'contentType': 'text',
382+
'textKey': 'syntheticData.correlationMatrixDescription'
383+
}],
371384
},
372385
{
373386
'reportType': 'table',

src/components/DistributionReport.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ export const DistributionReport = (
137137
return null;
138138
}
139139
const preContent: additionalContent = report.preContent
140-
? JSON.parse(report.preContent)
140+
? (report.preContent as unknown as additionalContent)
141141
: [];
142142
const postContent: additionalContent =
143143
(report.postContent as unknown as additionalContent) ??
@@ -255,7 +255,7 @@ export const DistributionReport = (
255255
return null;
256256
}
257257
const preContent: additionalContent = report.preContent
258-
? JSON.parse(report.preContent)
258+
? (report.preContent as unknown as additionalContent)
259259
: [];
260260
const postContent: additionalContent =
261261
(report.postContent as unknown as additionalContent) ??

src/components/componentMapper.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ export default function ComponentMapper({
128128

129129
case 'text':
130130
return (
131-
<TooltipProvider>
131+
<TooltipProvider key={index}>
132132
<MarkdownWithTooltips
133133
key={index}
134134
className="-mt-2 text-gray-800 markdown"

src/locales/en.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,39 @@ export const en = {
154154
distributionsTitle: '4.1 Distributions',
155155
diagnosticsReportTitle: '4.2. Diagnostic Report',
156156
diagnosticsTitle: 'Diagnostic Results',
157+
diagnosticsReportDescription: `For each column, diagnostic results are computed for the quality of the generated synthetic data. The computed metrics depend on the type of data.
158+
159+
For numerical (or datetime) columns the following metrics are computed:
160+
- {tooltip:syntheticData.missingValueSimilarity}Missing value similarity{/tooltip}
161+
- {tooltip:syntheticData.rangeCoverage}Range coverage{/tooltip}
162+
- {tooltip:syntheticData.boundaryAdherenc}Boundary adherence{/tooltip}
163+
- {tooltip:syntheticData.statisticSimilarity}Statistic similarity{/tooltip}
164+
- {tooltip:syntheticData.kolmogorovSmirnovComplement}Kolmogorov–Smirnov (KS) complement{/tooltip}
165+
166+
For categorical (or boolean) columns the following metrics are computed:
167+
- {tooltip:syntheticData.missingValueSimilarity}Missing value similarity{/tooltip}
168+
- {tooltip:syntheticData.categoryCoverage}Category coverage{/tooltip}
169+
- {tooltip:syntheticData.categoryAdherence}Category adherence{/tooltip}
170+
- {tooltip:syntheticData.totalVariationComplement}Total variation (TV) complement{/tooltip}`,
171+
missingValueSimilarity:
172+
'Compares whether the synthetic data has the same proportion of missing values as the real data for a given column',
173+
rangeCoverage:
174+
'Measures whether a synthetic column covers the full range of values that are present in a real column',
175+
boundaryAdherenc:
176+
'Measures whether a synthetic column respects the minimum and maximum values of the real column. It returns the percentage of synthetic rows that adhere to the real boundaries',
177+
statisticSimilarity:
178+
'Measures the similarity between a real column and a synthetic column by comparing the mean, standard deviation and median',
179+
kolmogorovSmirnovComplement:
180+
'Computes the similarity of a real and synthetic numerical column in terms of the column shapes, i.e., the marginal distribution or 1D histogram of the column.',
181+
categoryCoverage:
182+
'Measures whether a synthetic column covers all the possible categories that are present in a real column',
183+
categoryAdherence:
184+
'Measures whether a synthetic column adheres to the same category values as the real data',
185+
totalVariationComplement:
186+
'Computes the similarity of a real and synthetic categorical column in terms of the column shapes, i.e., the marginal distribution or 1D histogram of the column.',
157187
correlationMatrixTitle: 'Correlation matrix',
188+
correlationMatrixDescription: `The matrix below illustrates the differences in pairwise correlations between variables in the original and synthetic data.
189+
Green cells signify that the pairwise correlation was accurately captured, with 0 representing the best possible score. Red cells indicate poor capture of the pairwise correlation.`,
158190
efficacyMetricsTitle: 'Efficacy metrics',
159191
disclosureProtectionTitle: 'Privacy metrics',
160192
outputDataTitle: '5. Generated synthetic data',

0 commit comments

Comments
 (0)