@@ -72,54 +72,6 @@ class GaussianCopulaSynthesizer:
             synthetic_data[col] = np.interp(synthetic_uniform[:, i], quantiles, sorted_data)
 
         return synthetic_data
-
-
-def evaluate_distribution(real_data, synthetic_data):
-    """
-    Compare the distribution of each column in the real and synthetic data using
-    the Kolmogorov-Smirnov (KS) test.
-    """
-    results = {}
-    for column in real_data.columns:
-        real_col = real_data[column].dropna()
-        synthetic_col = synthetic_data[column].dropna()
-
-        # Perform the KS test
-        ks_stat, p_value = ks_2samp(real_col, synthetic_col)
-
-        # Store the result
-        results[column] = {'ks_stat': ks_stat, 'p_value': p_value}
-    return results
-
-def evaluate_correlations(real_data, synthetic_data):
-    """
-    Compare the pairwise correlation matrices of the real and synthetic data.
-    """
-    real_corr = real_data.corr()
-    synthetic_corr = synthetic_data.corr()
-
-    # Compute the difference between the correlation matrices
-    corr_diff = np.abs(real_corr - synthetic_corr)
-    return corr_diff.mean().mean()  # Average correlation difference
-
-def run_diagnostic(real_data, synthetic_data, target_column):
-    """
-    Run diagnostics on synthetic data by evaluating distribution, correlations, and
-    classification model performance.
-    """
-    # Step 1: Evaluate distributions
-    distribution_results = evaluate_distribution(real_data, synthetic_data)
-
-    # Step 2: Evaluate correlations
-    correlation_diff = evaluate_correlations(real_data, synthetic_data)
-
-    # Aggregate results
-    diagnostics = {
-        'distribution_results': distribution_results,
-        'correlation_diff': correlation_diff
-    }
-
-    return diagnostics
 
 def run():
     csv_data = StringIO(data)
@@ -219,7 +171,7 @@ def run():
     processor = DataProcessor(metadata)
 
     # Preprocess the data: transforms raw data into a numerical format
-    processed_data = processor.preprocess(real_data)
+    processed_data = processor.preprocess(df_imputed)
 
     cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
     cart.fit(processed_data)
@@ -235,10 +187,6 @@ def run():
     # categorical
 
     print("Synthetic Data (first 5 rows):", synthetic_data.head())
-
-
-    # dtypes_dict = real_data.dtypes.to_dict()
-    # dtypes_dict = {k: 'float' if (v == 'float64' or v == 'int64') else 'category' if (v == 'O' or v =='bool') else v for k, v in dtypes_dict.items()}
 
 
     setResult(json.dumps({
@@ -249,7 +197,7 @@ def run():
 
 
     dataInfo = []
-    for column in real_data.columns:
+    for column in df_imputed.columns:
         dataInfo.append({
             'key': column,
             'value': column_dtypes[column]
@@ -265,7 +213,7 @@ def run():
         'key': 'syntheticData.columnsInDatasetInfo'
     }))
 
-    cloned_real_data = real_data.copy()
+    cloned_real_data = df_imputed.copy()
 
     # if (sdgMethod == 'cart'):
     #     spop = Synthpop(method='cart')
@@ -276,15 +224,15 @@ def run():
     if (sdgMethod == 'gc'):
         # Initialize synthesizer and fit it to the data
         synthesizer = GaussianCopulaSynthesizer()
-        synthesizer.fit(real_data)
+        synthesizer.fit(df_imputed)
 
         # Generate synthetic data
         synthetic_data = synthesizer.sample(samples)
 
     synth_df_decoded = synthetic_data.copy()
 
     # Convert categorical variables to numerical values
-    df_encoded = real_data.copy()
+    df_encoded = df_imputed.copy()
     synth_df_encoded = synthetic_data.copy()
 
     for column in column_dtypes:
@@ -293,45 +241,45 @@ def run():
             synth_df_encoded[column] = synth_df_encoded[column].astype('category').cat.codes
 
     # Output some results
-    print("Original Data (first 5 rows):", real_data.head())
+    print("Original Data (first 5 rows):", df_imputed.head())
     print("Synthetic Data (first 5 rows):", synthetic_data.head())
 
     print("Synthetic Data decoded (first 5 rows):", synth_df_decoded.head())
 
     # Store synthetic data for export
     setOutputData("syntheticData", synthetic_data.to_json(orient='records'))
 
-    report = MetricsReport(real_data, synthetic_data, metadata)
+    report = MetricsReport(df_imputed, synthetic_data, metadata)
     report_df = report.generate_report()
     print('report_df:', report_df)
 
     # combine empty synthetic data with original data and with encoded data
-    combined_data = pd.concat((real_data.assign(realOrSynthetic='real'), synthetic_data.assign(realOrSynthetic='synthetic')), keys=['real','synthetic'], names=['Data'])
-
-    # for column in column_dtypes:
-    #     if column_dtypes[column] == 'categorical':
-    #         reg_efficacy = EfficacyMetrics(task='classification', target_column=column)
-    #         reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
-    #         print("=== Regression Efficacy Metrics ===", column)
-    #         print(reg_metrics)
-    #     else:
-    #         reg_efficacy = EfficacyMetrics(task='regression', target_column=column)
-    #         reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
-    #         print("=== Regression Efficacy Metrics ===", column)
-    #         print(reg_metrics)
-
-    reg_efficacy = EfficacyMetrics(task='regression', target_column="ugpa")
-    reg_metrics = reg_efficacy.evaluate(real_data, synthetic_data)
-    print("=== Regression Efficacy Metrics === UGPA")
-    print(reg_metrics)
+    combined_data = pd.concat((df_imputed.assign(realOrSynthetic='real'), synthetic_data.assign(realOrSynthetic='synthetic')), keys=['real','synthetic'], names=['Data'])
+
+    for column in column_dtypes:
+        if column_dtypes[column] == 'categorical':
+            reg_efficacy = EfficacyMetrics(task='classification', target_column=column)
+            reg_metrics = reg_efficacy.evaluate(df_imputed, synthetic_data)
+            print("=== Regression Efficacy Metrics ===", column)
+            print(reg_metrics)
+        else:
+            reg_efficacy = EfficacyMetrics(task='regression', target_column=column)
+            reg_metrics = reg_efficacy.evaluate(df_imputed, synthetic_data)
+            print("=== Regression Efficacy Metrics ===", column)
+            print(reg_metrics)
+
+    # reg_efficacy = EfficacyMetrics(task='regression', target_column="ugpa")
+    # reg_metrics = reg_efficacy.evaluate(df_imputed, synthetic_data)
+    # print("=== Regression Efficacy Metrics === UGPA")
+    # print(reg_metrics)
 
     clf_efficacy = EfficacyMetrics(task='classification', target_column="bar")
-    clf_metrics = clf_efficacy.evaluate(real_data, synthetic_data)
+    clf_metrics = clf_efficacy.evaluate(df_imputed, synthetic_data)
     print("=== Classification Efficacy Metrics === BAR")
     print(clf_metrics)
 
 
-    dp = DisclosureProtection(real_data, synthetic_data)
+    dp = DisclosureProtection(df_imputed, synthetic_data)
     dp_score = dp.score()
     dp_report = dp.report()
 
@@ -367,9 +315,9 @@ def run():
             'titleKey': 'syntheticData.diagnosticsTitle',
             'showIndex' : False,
             'data': report_df.to_json(orient="records"),
-            'postContent': json.dumps( [{
+            'postContent': [{
                 'contentType' : 'correlationSyntheticData'
-            }])
+            }]
         },
         {'reportType': 'bivariateDistributionSyntheticData'}
     ]
@@ -400,7 +348,7 @@ if data != 'INIT':
 
 /*
 
-# df_numeric = real_data.apply(pd.to_numeric, errors='coerce')
+# df_numeric = df_imputed.apply(pd.to_numeric, errors='coerce')
 # synth_df_numeric = synthetic_data.apply(pd.to_numeric, errors='coerce')
 
 # 'syntheticCorrelations': np.abs(df_numeric.corr() - synth_df_numeric.corr()).to_json(orient="records"),