@@ -31,49 +31,6 @@ from js import setOutputData
3131from js import nanTreatment
3232
3333
class GaussianCopulaSynthesizer:
    """Generate synthetic tabular data with a Gaussian copula.

    ``fit`` stores the empirical marginal of every column and estimates the
    mean vector and covariance matrix of the rank-transformed
    ("Gaussianized") data.  ``sample`` draws from that multivariate normal
    and maps each draw back through the stored marginals.
    """

    def __init__(self):
        self.means = None
        self.cov_matrix = None
        # Never assigned by fit/sample; kept so that external code reading
        # the attribute keeps working.
        self.scaler = None
        self.data_marginals = None

    def fit(self, data):
        """Fit the Gaussian copula model to ``data``.

        Args:
            data: pandas DataFrame whose columns can be sorted and ranked.

        Raises:
            ValueError: if ``data`` has fewer than two rows, because the
                sample covariance is undefined in that case.
        """
        n_rows = len(data)
        if n_rows < 2:
            raise ValueError(
                "GaussianCopulaSynthesizer.fit needs at least two rows"
            )

        # Step 1: store the empirical marginal of every column as
        # (sorted values, matching quantile grid, column name).
        grid = np.linspace(0, 1, n_rows)
        self.data_marginals = [
            (np.sort(data[col]), grid, col) for col in data.columns
        ]

        # Step 2: Gaussianize.  Ranks are divided by (count + 1) rather than
        # by count: a percentile rank of exactly 1.0 for each column's
        # maximum would map to +inf under norm.ppf and poison the mean and
        # covariance estimates below.
        uniform_data = data.rank() / (data.count() + 1)
        gaussian_data = norm.ppf(uniform_data.to_numpy(dtype=float))

        # Step 3: fit a multivariate Gaussian to the normal scores.
        self.means = gaussian_data.mean(axis=0)
        # np.cov collapses to a 0-d array for a single column; keep it 2-D
        # so that multivariate_normal accepts it.
        self.cov_matrix = np.atleast_2d(np.cov(gaussian_data, rowvar=False))

    def sample(self, n_samples):
        """Generate synthetic rows from the fitted model.

        Args:
            n_samples: number of synthetic rows to draw.

        Returns:
            pandas DataFrame with ``n_samples`` rows and the same column
            names, in the same order, as the data passed to ``fit``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.data_marginals is None:
            raise RuntimeError(
                "GaussianCopulaSynthesizer.sample called before fit"
            )

        # Step 1: sample from the fitted multivariate normal distribution.
        synthetic_gaussian = np.random.multivariate_normal(
            self.means, self.cov_matrix, n_samples
        )

        # Step 2: normal -> uniform via the standard normal CDF.
        synthetic_uniform = norm.cdf(synthetic_gaussian)

        # Step 3: uniform -> original scale by interpolating each stored
        # empirical marginal.
        return pd.DataFrame({
            col: np.interp(synthetic_uniform[:, i], quantiles, sorted_data)
            for i, (sorted_data, quantiles, col)
            in enumerate(self.data_marginals)
        })
7734def run():
7835 csv_data = StringIO(data)
7936
@@ -140,10 +97,10 @@ def run():
14097 setResult(json.dumps(
14198 {'type': 'data-set-preview', 'data': ''}
14299 ))
100+
143101 if isDemo:
144102 real_data['sex'] = real_data['sex'].replace({1: 'male', 2: 'female'})
145103
146-
147104 print(real_data.isnull().sum())
148105
149106 md_handler = MissingDataHandler()
@@ -152,9 +109,6 @@ def run():
152109 column_dtypes = md_handler.get_column_dtypes(real_data)
153110
154111
155- # if isDemo:
156- # column_dtypes['sex'] = 'categorical'
157-
158112 print("Column Data Types:", column_dtypes)
159113
160114
@@ -223,33 +177,7 @@ def run():
223177 }))
224178
225179 cloned_real_data = df_imputed.copy()
226-
227- # if (sdgMethod == 'cart'):
228- # spop = Synthpop(method='cart')
229- # spop = Synthpop()
230- # spop.fit(real_data, dtypes=dtypes_dict)
231- # synthetic_data = spop.generate(k=samples)
232-
233- # if (sdgMethod == 'gc'):
234- # Initialize synthesizer and fit it to the data
235- # synthesizer = GaussianCopulaSynthesizer()
236-
237- # Handle NaN values based on the selected treatment method
238- # if nanTreatment == 'drop':
239- # df_imputed = df_imputed.dropna()
240- # elif nanTreatment == 'impute':
241- # Use mean imputation for numerical columns and mode imputation for categorical columns
242- # for column in df_imputed.columns:
243- # if column_dtypes[column] == 'categorical':
244- # df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
245- # else:
246- # df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
247-
248- #synthesizer.fit(df_imputed)
249-
250- # Generate synthetic data
251- # synthetic_data = synthesizer.sample(samples)
252-
180+
253181 synth_df_decoded = synthetic_data.copy()
254182
255183 # Convert categorical variables to numerical values
0 commit comments