@@ -148,12 +148,12 @@ def run():
148148 real_data['sex'] = real_data['sex'].map({1: 'male', 2: 'female'})
149149
150150 cloned_real_data = real_data.copy()
151+ label_encoders = {}
152+ for column in real_data.select_dtypes(include=['object']).columns:
153+ label_encoders[column] = LabelEncoder()
154+ real_data[column] = label_encoders[column].fit_transform(real_data[column])
151155
152156 if (sdgMethod == 'cart'):
153- label_encoders = {}
154- for column in real_data.select_dtypes(include=['object']).columns:
155- label_encoders[column] = LabelEncoder()
156- real_data[column] = label_encoders[column].fit_transform(real_data[column])
157157 # spop = Synthpop(method='cart')
158158 spop = Synthpop()
159159 spop.fit(real_data, dtypes=dtypes_dict)
@@ -167,10 +167,18 @@ def run():
167167 # Generate synthetic data
168168 synthetic_data = synthesizer.sample(samples)
169169
170+ synth_df_decoded = synthetic_data.copy()
171+ for column in synth_df_decoded.columns:
172+ if column in label_encoders:
173+ synth_df_decoded[column] = label_encoders[column].inverse_transform(synth_df_decoded[column])
174+
175+
170176 # Output some results
171177 print("Original Data (first 5 rows):", real_data.head())
172178 print("Synthetic Data (first 5 rows):", synthetic_data.head())
173179
180+ print("Synthetic Data decoded (first 5 rows):", synth_df_decoded.head())
181+
174182 # Store synthetic data for export
175183 setOutputData("syntheticData", synthetic_data.to_json(orient='records'))
176184
@@ -197,7 +205,6 @@ def run():
197205 ))
198206 setResult(json.dumps({'type': 'table', 'data': synthetic_data.head().to_json(orient="records")}))
199207
200- setResult(json.dumps({'type': 'heatmap', 'real': real_data.corr().to_json(orient="records"), 'synthetic': synthetic_data.corr().to_json(orient="records")}))
201208
202209 # copy dataframe and assign NaN to all values
203210 synth_df = real_data.copy()
@@ -207,7 +214,10 @@ def run():
207214 combined_data = pd.concat((real_data.assign(realOrSynthetic='real'), synth_df.assign(realOrSynthetic='synthetic')), keys=['real','synthetic'], names=['Data'])
208215 # combined_data_encoded = pd.concat((df_encoded.assign(realOrSynthetic='real_encoded'), synth_df.assign(realOrSynthetic='synthetic')), keys=['real_encoded','synthetic'], names=['Data'])
209216
210- setResult(json.dumps({'type': 'distribution', 'real': cloned_real_data.to_json(orient="records"), 'synthetic': synthetic_data.to_json(orient="records"), 'dataTypes': json.dumps(dtypes_dict), 'combined_data' : combined_data.to_json(orient="records")}))
217+ # setResult(json.dumps({'type': 'distribution', 'real': real_data.to_json(orient="records"), 'synthetic': synthetic_data.to_json(orient="records"), 'dataTypes': json.dumps(dtypes_dict), 'combined_data' : combined_data.to_json(orient="records")}))
218+ setResult(json.dumps({'type': 'distribution', 'real': cloned_real_data.to_json(orient="records"), 'synthetic': synth_df_decoded.to_json(orient="records"), 'dataTypes': json.dumps(dtypes_dict), 'combined_data' : combined_data.to_json(orient="records")}))
219+
220+ setResult(json.dumps({'type': 'heatmap', 'real': real_data.corr().to_json(orient="records"), 'synthetic': synthetic_data.corr().to_json(orient="records")}))
211221
212222 return
213223
0 commit comments