Skip to content

Commit 74caa29

Browse files
committed
Distribution bar chart now also supports categorical data
1 parent 47d6904 commit 74caa29

File tree

3 files changed

+141
-111
lines changed

3 files changed

+141
-111
lines changed

src/assets/synthetic-data.tsx

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -148,12 +148,12 @@ def run():
148148
real_data['sex'] = real_data['sex'].map({1: 'male', 2: 'female'})
149149
150150
cloned_real_data = real_data.copy()
151+
label_encoders = {}
152+
for column in real_data.select_dtypes(include=['object']).columns:
153+
label_encoders[column] = LabelEncoder()
154+
real_data[column] = label_encoders[column].fit_transform(real_data[column])
151155
152156
if (sdgMethod == 'cart'):
153-
label_encoders = {}
154-
for column in real_data.select_dtypes(include=['object']).columns:
155-
label_encoders[column] = LabelEncoder()
156-
real_data[column] = label_encoders[column].fit_transform(real_data[column])
157157
# spop = Synthpop(method='cart')
158158
spop = Synthpop()
159159
spop.fit(real_data, dtypes=dtypes_dict)
@@ -167,10 +167,18 @@ def run():
167167
# Generate synthetic data
168168
synthetic_data = synthesizer.sample(samples)
169169
170+
synth_df_decoded = synthetic_data.copy()
171+
for column in synth_df_decoded.columns:
172+
if column in label_encoders:
173+
synth_df_decoded[column] = label_encoders[column].inverse_transform(synth_df_decoded[column])
174+
175+
170176
# Output some results
171177
print("Original Data (first 5 rows):", real_data.head())
172178
print("Synthetic Data (first 5 rows):", synthetic_data.head())
173179
180+
print("Synthetic Data decoded (first 5 rows):", synth_df_decoded.head())
181+
174182
# Store synthetic data for export
175183
setOutputData("syntheticData", synthetic_data.to_json(orient='records'))
176184
@@ -197,7 +205,6 @@ def run():
197205
))
198206
setResult(json.dumps({'type': 'table', 'data': synthetic_data.head().to_json(orient="records")}))
199207
200-
setResult(json.dumps({'type': 'heatmap', 'real': real_data.corr().to_json(orient="records"), 'synthetic': synthetic_data.corr().to_json(orient="records")}))
201208
202209
# copy dataframe and assign NaN to all values
203210
synth_df = real_data.copy()
@@ -207,7 +214,10 @@ def run():
207214
combined_data = pd.concat((real_data.assign(realOrSynthetic='real'), synth_df.assign(realOrSynthetic='synthetic')), keys=['real','synthetic'], names=['Data'])
208215
# combined_data_encoded = pd.concat((df_encoded.assign(realOrSynthetic='real_encoded'), synth_df.assign(realOrSynthetic='synthetic')), keys=['real_encoded','synthetic'], names=['Data'])
209216
210-
setResult(json.dumps({'type': 'distribution', 'real': cloned_real_data.to_json(orient="records"), 'synthetic': synthetic_data.to_json(orient="records"), 'dataTypes': json.dumps(dtypes_dict), 'combined_data' : combined_data.to_json(orient="records")}))
217+
# setResult(json.dumps({'type': 'distribution', 'real': real_data.to_json(orient="records"), 'synthetic': synthetic_data.to_json(orient="records"), 'dataTypes': json.dumps(dtypes_dict), 'combined_data' : combined_data.to_json(orient="records")}))
218+
setResult(json.dumps({'type': 'distribution', 'real': cloned_real_data.to_json(orient="records"), 'synthetic': synth_df_decoded.to_json(orient="records"), 'dataTypes': json.dumps(dtypes_dict), 'combined_data' : combined_data.to_json(orient="records")}))
219+
220+
setResult(json.dumps({'type': 'heatmap', 'real': real_data.corr().to_json(orient="records"), 'synthetic': synthetic_data.corr().to_json(orient="records")}))
211221
212222
return
213223

src/components/componentMapper.tsx

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,14 +185,14 @@ export default function ComponentMapper({
185185

186186
const realData = JSON.parse(resultItem.real);
187187
const syntheticData = JSON.parse(resultItem.synthetic);
188-
188+
const dataTypes = JSON.parse(resultItem.dataTypes);
189189
console.log('realData', realData);
190190
return (
191191
<div key={`distribution-${index}`}>
192192
<UnivariateCharts
193193
realData={realData}
194194
syntheticData={syntheticData}
195-
dataTypes={JSON.parse(resultItem.dataTypes)}
195+
dataTypes={dataTypes}
196196
combined_data={JSON.parse(
197197
resultItem.combined_data
198198
)}
@@ -229,6 +229,11 @@ export default function ComponentMapper({
229229
key={columnIndex}
230230
>
231231
<DistributionBarChart
232+
dataType={
233+
dataTypes[
234+
columnName
235+
]
236+
}
232237
realData={
233238
realDataColumn
234239
}

0 commit comments

Comments
 (0)