Skip to content

Commit 9025b77

Browse files
authored
Merge pull request #52 from NGO-Algorithm-Audit/feature/fix-imputed
use imputed instead of uploaded data for calculations
2 parents d96c51f + 0373abd commit 9025b77

File tree

2 files changed

+32
-85
lines changed

2 files changed

+32
-85
lines changed

src/assets/synthetic-data.tsx

Lines changed: 30 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -72,54 +72,6 @@ class GaussianCopulaSynthesizer:
7272
synthetic_data[col] = np.interp(synthetic_uniform[:, i], quantiles, sorted_data)
7373
7474
return synthetic_data
75-
76-
77-
def evaluate_distribution(real_data, synthetic_data):
78-
"""
79-
Compare the distribution of each column in the real and synthetic data using
80-
the Kolmogorov-Smirnov (KS) test.
81-
"""
82-
results = {}
83-
for column in real_data.columns:
84-
real_col = real_data[column].dropna()
85-
synthetic_col = synthetic_data[column].dropna()
86-
87-
# Perform the KS test
88-
ks_stat, p_value = ks_2samp(real_col, synthetic_col)
89-
90-
# Store the result
91-
results[column] = {'ks_stat': ks_stat, 'p_value': p_value}
92-
return results
93-
94-
def evaluate_correlations(real_data, synthetic_data):
95-
"""
96-
Compare the pairwise correlation matrices of the real and synthetic data.
97-
"""
98-
real_corr = real_data.corr()
99-
synthetic_corr = synthetic_data.corr()
100-
101-
# Compute the difference between the correlation matrices
102-
corr_diff = np.abs(real_corr - synthetic_corr)
103-
return corr_diff.mean().mean() # Average correlation difference
104-
105-
def run_diagnostic(real_data, synthetic_data, target_column):
106-
"""
107-
Run diagnostics on synthetic data by evaluating distribution, correlations, and
108-
classification model performance.
109-
"""
110-
# Step 1: Evaluate distributions
111-
distribution_results = evaluate_distribution(real_data, synthetic_data)
112-
113-
# Step 2: Evaluate correlations
114-
correlation_diff = evaluate_correlations(real_data, synthetic_data)
115-
116-
# Aggregate results
117-
diagnostics = {
118-
'distribution_results': distribution_results,
119-
'correlation_diff': correlation_diff
120-
}
121-
122-
return diagnostics
12375
12476
def run():
12577
csv_data = StringIO(data)
@@ -219,7 +171,7 @@ def run():
219171
processor = DataProcessor(metadata)
220172
221173
# Preprocess the data: transforms raw data into a numerical format
222-
processed_data = processor.preprocess(real_data)
174+
processed_data = processor.preprocess(df_imputed)
223175
224176
cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
225177
cart.fit(processed_data)
@@ -235,10 +187,6 @@ def run():
235187
# categorical
236188
237189
print("Synthetic Data (first 5 rows):", synthetic_data.head())
238-
239-
240-
# dtypes_dict = real_data.dtypes.to_dict()
241-
# dtypes_dict = {k: 'float' if (v == 'float64' or v == 'int64') else 'category' if (v == 'O' or v =='bool') else v for k, v in dtypes_dict.items()}
242190
243191
244192
setResult(json.dumps({
@@ -249,7 +197,7 @@ def run():
249197
250198
251199
dataInfo = []
252-
for column in real_data.columns:
200+
for column in df_imputed.columns:
253201
dataInfo.append({
254202
'key': column,
255203
'value': column_dtypes[column]
@@ -265,7 +213,7 @@ def run():
265213
'key': 'syntheticData.columnsInDatasetInfo'
266214
}))
267215
268-
cloned_real_data = real_data.copy()
216+
cloned_real_data = df_imputed.copy()
269217
270218
# if (sdgMethod == 'cart'):
271219
# spop = Synthpop(method='cart')
@@ -276,15 +224,15 @@ def run():
276224
if (sdgMethod == 'gc'):
277225
# Initialize synthesizer and fit it to the data
278226
synthesizer = GaussianCopulaSynthesizer()
279-
synthesizer.fit(real_data)
227+
synthesizer.fit(df_imputed)
280228
281229
# Generate synthetic data
282230
synthetic_data = synthesizer.sample(samples)
283231
284232
synth_df_decoded = synthetic_data.copy()
285233
286234
# Convert categorical variables to numerical values
287-
df_encoded = real_data.copy()
235+
df_encoded = df_imputed.copy()
288236
synth_df_encoded = synthetic_data.copy()
289237
290238
for column in column_dtypes:
@@ -293,45 +241,45 @@ def run():
293241
synth_df_encoded[column] = synth_df_encoded[column].astype('category').cat.codes
294242
295243
# Output some results
296-
print("Original Data (first 5 rows):", real_data.head())
244+
print("Original Data (first 5 rows):", df_imputed.head())
297245
print("Synthetic Data (first 5 rows):", synthetic_data.head())
298246
299247
print("Synthetic Data decoded (first 5 rows):", synth_df_decoded.head())
300248
301249
# Store synthetic data for export
302250
setOutputData("syntheticData", synthetic_data.to_json(orient='records'))
303251
304-
report = MetricsReport(real_data, synthetic_data, metadata)
252+
report = MetricsReport(df_imputed, synthetic_data, metadata)
305253
report_df = report.generate_report()
306254
print('report_df:', report_df)
307255
308256
# combine empty synthetic data with original data and with encoded data
309-
combined_data = pd.concat((real_data.assign(realOrSynthetic='real'), synthetic_data.assign(realOrSynthetic='synthetic')), keys=['real','synthetic'], names=['Data'])
310-
311-
# for column in column_dtypes:
312-
# if column_dtypes[column] == 'categorical':
313-
# reg_efficacy = EfficacyMetrics(task='classification', target_column=column)
314-
# reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
315-
# print("=== Regression Efficacy Metrics ===", column)
316-
# print(reg_metrics)
317-
# else:
318-
# reg_efficacy = EfficacyMetrics(task='regression', target_column=column)
319-
# reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
320-
# print("=== Regression Efficacy Metrics ===", column)
321-
# print(reg_metrics)
322-
323-
reg_efficacy = EfficacyMetrics(task='regression', target_column="ugpa")
324-
reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
325-
print("=== Regression Efficacy Metrics === UGPA")
326-
print(reg_metrics)
257+
combined_data = pd.concat((df_imputed.assign(realOrSynthetic='real'), synthetic_data.assign(realOrSynthetic='synthetic')), keys=['real','synthetic'], names=['Data'])
258+
259+
for column in column_dtypes:
260+
if column_dtypes[column] == 'categorical':
261+
reg_efficacy = EfficacyMetrics(task='classification', target_column=column)
262+
reg_metrics = reg_efficacy.evaluate(df_imputed, synthetic_data)
263+
print("=== Regression Efficacy Metrics ===", column)
264+
print(reg_metrics)
265+
else:
266+
reg_efficacy = EfficacyMetrics(task='regression', target_column=column)
267+
reg_metrics = reg_efficacy.evaluate(df_imputed, synthetic_data)
268+
print("=== Regression Efficacy Metrics ===", column)
269+
print(reg_metrics)
270+
271+
# reg_efficacy = EfficacyMetrics(task='regression', target_column="ugpa")
272+
# reg_metrics = reg_efficacy.evaluate(df_imputed, synthetic_data)
273+
# print("=== Regression Efficacy Metrics === UGPA")
274+
# print(reg_metrics)
327275
328276
clf_efficacy = EfficacyMetrics(task='classification', target_column="bar")
329-
clf_metrics = clf_efficacy.evaluate(real_data, synthetic_data)
277+
clf_metrics = clf_efficacy.evaluate(df_imputed, synthetic_data)
330278
print("=== Classification Efficacy Metrics === BAR")
331279
print(clf_metrics)
332280
333281
334-
dp = DisclosureProtection(real_data, synthetic_data)
282+
dp = DisclosureProtection(df_imputed, synthetic_data)
335283
dp_score = dp.score()
336284
dp_report = dp.report()
337285
@@ -367,9 +315,9 @@ def run():
367315
'titleKey': 'syntheticData.diagnosticsTitle',
368316
'showIndex' : False,
369317
'data': report_df.to_json(orient="records"),
370-
'postContent': json.dumps([{
318+
'postContent': [{
371319
'contentType' : 'correlationSyntheticData'
372-
}])
320+
}]
373321
},
374322
{'reportType': 'bivariateDistributionSyntheticData'}
375323
]
@@ -400,7 +348,7 @@ if data != 'INIT':
400348

401349
/*
402350
403-
# df_numeric = real_data.apply(pd.to_numeric, errors='coerce')
351+
# df_numeric = df_imputed.apply(pd.to_numeric, errors='coerce')
404352
# synth_df_numeric = synthetic_data.apply(pd.to_numeric, errors='coerce')
405353
406354
# 'syntheticCorrelations': np.abs(df_numeric.corr() - synth_df_numeric.corr()).to_json(orient="records"),

src/components/DistributionReport.tsx

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,9 +126,8 @@ export const DistributionReport = (
126126
? JSON.parse(report.preContent)
127127
: [];
128128
const postContent: additionalContent =
129-
report.postContent
130-
? JSON.parse(report.postContent)
131-
: [];
129+
(report.postContent as unknown as additionalContent) ??
130+
[];
132131

133132
return (
134133
<div key={indexReport} className="mb-4">

0 commit comments

Comments
 (0)