Skip to content

Commit c1c2265

Browse files
committed
Add GaussianCopulaMethod for synthetic data generation and refactor sampling logic
1 parent 6e6cf6f commit c1c2265

File tree

1 file changed

+23
-18
lines changed

1 file changed

+23
-18
lines changed

src/assets/synthetic-data.tsx

Lines changed: 23 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@ import warnings
77
import scipy.stats as stats
88
from scipy.stats import norm, ks_2samp
99
from sklearn.preprocessing import LabelEncoder
10-
from synthpop import MissingDataHandler, DataProcessor, CARTMethod
10+
from synthpop import MissingDataHandler, DataProcessor, CARTMethod, GaussianCopulaMethod
1111
from synthpop.metrics import (
1212
MetricsReport,
1313
EfficacyMetrics,
@@ -177,11 +177,16 @@ def run():
177177
# Preprocess the data: transforms raw data into a numerical format
178178
processed_data = processor.preprocess(df_imputed)
179179
180-
cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
181-
cart.fit(processed_data)
182-
183-
synthetic_processed = cart.sample(samples)
184-
180+
if (sdgMethod == 'cart'):
181+
cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
182+
cart.fit(processed_data)
183+
synthetic_processed = cart.sample(samples)
184+
185+
if (sdgMethod == 'gc'):
186+
gc = GaussianCopulaMethod(metadata)
187+
gc.fit(processed_data)
188+
synthetic_processed = gc.sample(samples)
189+
185190
print("synthetic_processed (first 5 rows):", synthetic_processed.head())
186191
187192
synthetic_data = processor.postprocess(synthetic_processed)
@@ -225,25 +230,25 @@ def run():
225230
# spop.fit(real_data, dtypes=dtypes_dict)
226231
# synthetic_data = spop.generate(k=samples)
227232
228-
if (sdgMethod == 'gc'):
233+
# if (sdgMethod == 'gc'):
229234
# Initialize synthesizer and fit it to the data
230-
synthesizer = GaussianCopulaSynthesizer()
235+
# synthesizer = GaussianCopulaSynthesizer()
231236
232237
# Handle NaN values based on the selected treatment method
233-
if nanTreatment == 'drop':
234-
df_imputed = df_imputed.dropna()
235-
elif nanTreatment == 'impute':
238+
# if nanTreatment == 'drop':
239+
# df_imputed = df_imputed.dropna()
240+
# elif nanTreatment == 'impute':
236241
# Use mean imputation for numerical columns and mode imputation for categorical columns
237-
for column in df_imputed.columns:
238-
if column_dtypes[column] == 'categorical':
239-
df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
240-
else:
241-
df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
242+
# for column in df_imputed.columns:
243+
# if column_dtypes[column] == 'categorical':
244+
# df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
245+
# else:
246+
# df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
242247
243-
synthesizer.fit(df_imputed)
248+
#synthesizer.fit(df_imputed)
244249
245250
# Generate synthetic data
246-
synthetic_data = synthesizer.sample(samples)
251+
# synthetic_data = synthesizer.sample(samples)
247252
248253
synth_df_decoded = synthetic_data.copy()
249254

0 commit comments

Comments
 (0)