Skip to content

Commit 0c7ff99

Browse files
authored
Merge pull request #50 from NGO-Algorithm-Audit/feature/fix-heatmaps-and-update-python-synthpop
Refactor synthetic data processing and enhance efficacy metrics evaluation
2 parents 7abdd77 + f5c52a6 commit 0c7ff99

File tree

4 files changed

+40
-24
lines changed

4 files changed

+40
-24
lines changed
544 KB
Binary file not shown.

src/assets/synthetic-data.tsx

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -266,10 +266,6 @@ def run():
266266
}))
267267
268268
cloned_real_data = real_data.copy()
269-
# label_encoders = {}
270-
# for column in real_data.select_dtypes(include=['object']).columns:
271-
# label_encoders[column] = LabelEncoder()
272-
# real_data[column] = label_encoders[column].fit_transform(real_data[column])
273269
274270
# if (sdgMethod == 'cart'):
275271
# spop = Synthpop(method='cart')
@@ -286,9 +282,6 @@ def run():
286282
synthetic_data = synthesizer.sample(samples)
287283
288284
synth_df_decoded = synthetic_data.copy()
289-
# for column in synth_df_decoded.columns:
290-
# if column in label_encoders:
291-
# synth_df_decoded[column] = label_encoders[column].inverse_transform(synth_df_decoded[column])
292285
293286
# Convert categorical variables to numerical values
294287
df_encoded = real_data.copy()
@@ -298,15 +291,6 @@ def run():
298291
if column_dtypes[column] == 'categorical':
299292
df_encoded[column] = df_encoded[column].astype('category').cat.codes
300293
synth_df_encoded[column] = synth_df_encoded[column].astype('category').cat.codes
301-
302-
# df_encoded['sex'] = df_encoded['sex'].astype('category').cat.codes
303-
# df_encoded['race1'] = df_encoded['race1'].astype('category').cat.codes
304-
# df_encoded['bar'] = df_encoded['bar'].astype('category').cat.codes
305-
306-
# synth_df_encoded = synthetic_data.copy()
307-
# synth_df_encoded['sex'] = synth_df_encoded['sex'].astype('category').cat.codes
308-
# synth_df_encoded['race1'] = synth_df_encoded['race1'].astype('category').cat.codes
309-
# synth_df_encoded['bar'] = synth_df_encoded['bar'].astype('category').cat.codes
310294
311295
# Output some results
312296
print("Original Data (first 5 rows):", real_data.head())
@@ -317,16 +301,45 @@ def run():
317301
# Store synthetic data for export
318302
setOutputData("syntheticData", synthetic_data.to_json(orient='records'))
319303
320-
# results = run_diagnostic(real_data, synthetic_data, target_column='gpa')
321-
# print('Results:', results)
322-
323304
report = MetricsReport(real_data, synthetic_data, metadata)
324305
report_df = report.generate_report()
325306
print('report_df:', report_df)
326307
327308
# combine empty synthetic data with original data and with encoded data
328309
combined_data = pd.concat((real_data.assign(realOrSynthetic='real'), synthetic_data.assign(realOrSynthetic='synthetic')), keys=['real','synthetic'], names=['Data'])
329310
311+
# for column in column_dtypes:
312+
# if column_dtypes[column] == 'categorical':
313+
# reg_efficacy = EfficacyMetrics(task='classification', target_column=column)
314+
# reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
315+
# print("=== Regression Efficacy Metrics ===", column)
316+
# print(reg_metrics)
317+
# else:
318+
# reg_efficacy = EfficacyMetrics(task='regression', target_column=column)
319+
# reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
320+
# print("=== Regression Efficacy Metrics ===", column)
321+
# print(reg_metrics)
322+
323+
reg_efficacy = EfficacyMetrics(task='regression', target_column="ugpa")
324+
reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
325+
print("=== Regression Efficacy Metrics === UGPA")
326+
print(reg_metrics)
327+
328+
clf_efficacy = EfficacyMetrics(task='classification', target_column="bar")
329+
clf_metrics = clf_efficacy.evaluate(real_data, synthetic_data)
330+
print("=== Classification Efficacy Metrics === BAR")
331+
print(clf_metrics)
332+
333+
334+
dp = DisclosureProtection(real_data, synthetic_data)
335+
dp_score = dp.score()
336+
dp_report = dp.report()
337+
338+
print("=== Disclosure Protection ===")
339+
print(f"Score: {dp_score:.3f}")
340+
print("Detailed Report:", dp_report)
341+
342+
330343
setResult(json.dumps({
331344
'type': 'distribution',
332345
'real': cloned_real_data.to_json(orient="records"),
@@ -354,9 +367,9 @@ def run():
354367
'titleKey': 'syntheticData.diagnosticsTitle',
355368
'showIndex' : False,
356369
'data': report_df.to_json(orient="records"),
357-
# 'postContent': json.dumps([{
358-
# 'contentType' : 'correlationSyntheticData'
359-
# }])
370+
'postContent': json.dumps([{
371+
'contentType' : 'correlationSyntheticData'
372+
}])
360373
},
361374
{'reportType': 'bivariateDistributionSyntheticData'}
362375
]

src/components/DistributionReport.tsx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,10 @@ export const DistributionReport = (
189189
'correlationSyntheticData'
190190
) {
191191
return (
192-
<div className="grid lg:grid-cols-[50%_50%] grid-cols-[100%]">
192+
<div
193+
key={`index`}
194+
className="grid lg:grid-cols-[50%_50%] grid-cols-[100%]"
195+
>
193196
<div className="col-[1] lg:col-[1]">
194197
<CorrelationMatrix
195198
key={

src/components/pyodide/worker.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ self.onmessage = async (e: MessageData) => {
125125
'/kmodes-0.12.2-py2.py3-none-any.whl',
126126
'/unsupervised_bias_detection-0.2.1-py3-none-any.whl',
127127
'/copulas-0.12.1-py3-none-any.whl',
128-
'/python_synthpop-0.1-py3-none-any.whl',
128+
'/python_synthpop-0.1.1-py3-none-any.whl',
129129
]);
130130

131131
// const micropip = self.pyodide.pyimport('micropip');

0 commit comments

Comments
 (0)