@@ -266,10 +266,6 @@ def run():
266266 }))
267267
268268 cloned_real_data = real_data.copy()
269- # label_encoders = {}
270- # for column in real_data.select_dtypes(include=['object']).columns:
271- # label_encoders[column] = LabelEncoder()
272- # real_data[column] = label_encoders[column].fit_transform(real_data[column])
273269
274270 # if (sdgMethod == 'cart'):
275271 # spop = Synthpop(method='cart')
@@ -286,9 +282,6 @@ def run():
286282 synthetic_data = synthesizer.sample(samples)
287283
288284 synth_df_decoded = synthetic_data.copy()
289- # for column in synth_df_decoded.columns:
290- # if column in label_encoders:
291- # synth_df_decoded[column] = label_encoders[column].inverse_transform(synth_df_decoded[column])
292285
293286 # Convert categorical variables to numerical values
294287 df_encoded = real_data.copy()
@@ -298,15 +291,6 @@ def run():
298291 if column_dtypes[column] == 'categorical':
299292 df_encoded[column] = df_encoded[column].astype('category').cat.codes
300293 synth_df_encoded[column] = synth_df_encoded[column].astype('category').cat.codes
301-
302- # df_encoded['sex'] = df_encoded['sex'].astype('category').cat.codes
303- # df_encoded['race1'] = df_encoded['race1'].astype('category').cat.codes
304- # df_encoded['bar'] = df_encoded['bar'].astype('category').cat.codes
305-
306- # synth_df_encoded = synthetic_data.copy()
307- # synth_df_encoded['sex'] = synth_df_encoded['sex'].astype('category').cat.codes
308- # synth_df_encoded['race1'] = synth_df_encoded['race1'].astype('category').cat.codes
309- # synth_df_encoded['bar'] = synth_df_encoded['bar'].astype('category').cat.codes
310294
311295 # Output some results
312296 print("Original Data (first 5 rows):", real_data.head())
@@ -317,16 +301,45 @@ def run():
317301 # Store synthetic data for export
318302 setOutputData("syntheticData", synthetic_data.to_json(orient='records'))
319303
320- # results = run_diagnostic(real_data, synthetic_data, target_column='gpa')
321- # print('Results:', results)
322-
323304 report = MetricsReport(real_data, synthetic_data, metadata)
324305 report_df = report.generate_report()
325306 print('report_df:', report_df)
326307
327308 # combine real and synthetic data into one frame, tagging each row's origin in 'realOrSynthetic'
328309 combined_data = pd.concat((real_data.assign(realOrSynthetic='real'), synthetic_data.assign(realOrSynthetic='synthetic')), keys=['real','synthetic'], names=['Data'])
329310
311+ # for column in column_dtypes:
312+ # if column_dtypes[column] == 'categorical':
313+ # reg_efficacy = EfficacyMetrics(task='classification', target_column=column)
314+ # reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
315+ # print("=== Classification Efficacy Metrics ===", column)
316+ # print(reg_metrics)
317+ # else:
318+ # reg_efficacy = EfficacyMetrics(task='regression', target_column=column)
319+ # reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
320+ # print("=== Regression Efficacy Metrics ===", column)
321+ # print(reg_metrics)
322+
323+ reg_efficacy = EfficacyMetrics(task='regression', target_column="ugpa")
324+ reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
325+ print("=== Regression Efficacy Metrics === UGPA")
326+ print(reg_metrics)
327+
328+ clf_efficacy = EfficacyMetrics(task='classification', target_column="bar")
329+ clf_metrics = clf_efficacy.evaluate(real_data, synthetic_data)
330+ print("=== Classification Efficacy Metrics === BAR")
331+ print(clf_metrics)
332+
333+
334+ dp = DisclosureProtection(real_data, synthetic_data)
335+ dp_score = dp.score()
336+ dp_report = dp.report()
337+
338+ print("=== Disclosure Protection ===")
339+ print(f"Score: {dp_score:.3f}")
340+ print("Detailed Report:", dp_report)
341+
342+
330343 setResult(json.dumps({
331344 'type': 'distribution',
332345 'real': cloned_real_data.to_json(orient="records"),
@@ -354,9 +367,9 @@ def run():
354367 'titleKey': 'syntheticData.diagnosticsTitle',
355368 'showIndex' : False,
356369 'data': report_df.to_json(orient="records"),
357- # 'postContent': json.dumps([{
358- # 'contentType' : 'correlationSyntheticData'
359- # }])
370+ 'postContent': json.dumps([{
371+ 'contentType' : 'correlationSyntheticData'
372+ }])
360373 },
361374 {'reportType': 'bivariateDistributionSyntheticData'}
362375 ]
0 commit comments