
Commit 0905d30

Merge pull request #71 from NGO-Algorithm-Audit/feature/remove-old.gaussian
cleanup code
2 parents d509a6d + d45683f commit 0905d30

File tree

1 file changed: +2 -74 lines changed


src/assets/synthetic-data.tsx

Lines changed: 2 additions & 74 deletions
@@ -31,49 +31,6 @@ from js import setOutputData
 from js import nanTreatment


-class GaussianCopulaSynthesizer:
-    def __init__(self):
-        self.means = None
-        self.cov_matrix = None
-        self.scaler = None
-        self.data_marginals = None
-
-    def fit(self, data):
-        """
-        Fit the Gaussian Copula model to the given data.
-        """
-        # Step 1: Store data marginals (quantiles for each feature)
-        self.data_marginals = []
-        for col in data.columns:
-            sorted_data = np.sort(data[col])
-            quantiles = np.linspace(0, 1, len(sorted_data))
-            self.data_marginals.append((sorted_data, quantiles, col))
-
-        # Step 2: Convert data to normal distribution using CDF (Gaussianization)
-        uniform_data = data.rank(pct=True)  # Get percentile rank for each column (empirical CDF)
-        gaussian_data = norm.ppf(uniform_data)  # Convert uniform to standard normal
-
-        # Step 3: Fit a multivariate Gaussian to the normalized data
-        self.means = gaussian_data.mean(axis=0)
-        self.cov_matrix = np.cov(gaussian_data, rowvar=False)
-
-    def sample(self, n_samples):
-        """
-        Generate synthetic data using the fitted Gaussian Copula model.
-        """
-        # Step 1: Sample from the multivariate normal distribution
-        synthetic_gaussian = np.random.multivariate_normal(self.means, self.cov_matrix, n_samples)
-
-        # Step 2: Convert back to uniform distribution using CDF (normal -> uniform)
-        synthetic_uniform = norm.cdf(synthetic_gaussian)
-
-        # Step 3: Map uniform data back to the original marginals
-        synthetic_data = pd.DataFrame()
-        for i, (sorted_data, quantiles, col) in enumerate(self.data_marginals):
-            synthetic_data[col] = np.interp(synthetic_uniform[:, i], quantiles, sorted_data)
-
-        return synthetic_data
-
 def run():
     csv_data = StringIO(data)

@@ -140,10 +97,10 @@ def run():
     setResult(json.dumps(
         {'type': 'data-set-preview', 'data': ''}
     ))
+
     if isDemo:
         real_data['sex'] = real_data['sex'].replace({1: 'male', 2: 'female'})

-
     print(real_data.isnull().sum())

     md_handler = MissingDataHandler()
@@ -152,9 +109,6 @@ def run():
     column_dtypes = md_handler.get_column_dtypes(real_data)


-    # if isDemo:
-    #     column_dtypes['sex'] = 'categorical'
-
     print("Column Data Types:", column_dtypes)

@@ -223,33 +177,7 @@ def run():
     }))

     cloned_real_data = df_imputed.copy()
-
-    # if (sdgMethod == 'cart'):
-    #     spop = Synthpop(method='cart')
-    #     spop = Synthpop()
-    #     spop.fit(real_data, dtypes=dtypes_dict)
-    #     synthetic_data = spop.generate(k=samples)
-
-    # if (sdgMethod == 'gc'):
-    #     Initialize synthesizer and fit it to the data
-    #     synthesizer = GaussianCopulaSynthesizer()
-
-    #     Handle NaN values based on the selected treatment method
-    #     if nanTreatment == 'drop':
-    #         df_imputed = df_imputed.dropna()
-    #     elif nanTreatment == 'impute':
-    #         Use mean imputation for numerical columns and mode imputation for categorical columns
-    #         for column in df_imputed.columns:
-    #             if column_dtypes[column] == 'categorical':
-    #                 df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
-    #             else:
-    #                 df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
-
-    #synthesizer.fit(df_imputed)
-
-    # Generate synthetic data
-    # synthetic_data = synthesizer.sample(samples)
-
+
     synth_df_decoded = synthetic_data.copy()

     # Convert categorical variables to numerical values
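For context, a minimal sketch of how the removed GaussianCopulaSynthesizer was meant to be driven, mirroring the commented-out 'gc' branch deleted in the last hunk (the names df_imputed and samples come from that code; this is illustrative, not part of the commit):

    # Hypothetical usage of the removed class, following the commented-out code above
    synthesizer = GaussianCopulaSynthesizer()
    synthesizer.fit(df_imputed)                   # learn per-column marginals and the Gaussian correlation structure
    synthetic_data = synthesizer.sample(samples)  # draw synthetic rows mapped back to the original marginals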
