@@ -7,7 +7,7 @@ import warnings
77import scipy.stats as stats
88from scipy.stats import norm, ks_2samp
99from sklearn.preprocessing import LabelEncoder
10- from synthpop import MissingDataHandler, DataProcessor, CARTMethod
10+ from synthpop import MissingDataHandler, DataProcessor, CARTMethod, GaussianCopulaMethod
1111from synthpop.metrics import (
1212 MetricsReport,
1313 EfficacyMetrics,
@@ -177,11 +177,16 @@ def run():
177177 # Preprocess the data: transforms raw data into a numerical format
178178 processed_data = processor.preprocess(df_imputed)
179179
180- cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
181- cart.fit(processed_data)
182-
183- synthetic_processed = cart.sample(samples)
184-
180+ if (sdgMethod == 'cart'):
181+ cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
182+ cart.fit(processed_data)
183+ synthetic_processed = cart.sample(samples)
184+
185+ if (sdgMethod == 'gc'):
186+ gc = GaussianCopulaMethod(metadata)
187+ gc.fit(processed_data)
188+ synthetic_processed = gc.sample(samples)
189+
185190 print("synthetic_processed (first 5 rows):", synthetic_processed.head())
186191
187192 synthetic_data = processor.postprocess(synthetic_processed)
@@ -225,25 +230,25 @@ def run():
225230 # spop.fit(real_data, dtypes=dtypes_dict)
226231 # synthetic_data = spop.generate(k=samples)
227232
228- if (sdgMethod == 'gc'):
233+ # if (sdgMethod == 'gc'):
229234 # Initialize synthesizer and fit it to the data
230- synthesizer = GaussianCopulaSynthesizer()
235+ # synthesizer = GaussianCopulaSynthesizer()
231236
232237 # Handle NaN values based on the selected treatment method
233- if nanTreatment == 'drop':
234- df_imputed = df_imputed.dropna()
235- elif nanTreatment == 'impute':
238+ # if nanTreatment == 'drop':
239+ # df_imputed = df_imputed.dropna()
240+ # elif nanTreatment == 'impute':
236241 # Use mean imputation for numerical columns and mode imputation for categorical columns
237- for column in df_imputed.columns:
238- if column_dtypes[column] == 'categorical':
239- df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
240- else:
241- df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
242+ # for column in df_imputed.columns:
243+ # if column_dtypes[column] == 'categorical':
244+ # df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
245+ # else:
246+ # df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
242247
243- synthesizer.fit(df_imputed)
248+ # synthesizer.fit(df_imputed)
244249
245250 # Generate synthetic data
246- synthetic_data = synthesizer.sample(samples)
251+ # synthetic_data = synthesizer.sample(samples)
247252
248253 synth_df_decoded = synthetic_data.copy()
249254
0 commit comments