Skip to content

Commit 4e3b25e

Browse files
committed
Refactor categorical variable encoding in synthetic data processing for improved scalability
1 parent d7e79ff commit 4e3b25e

File tree

1 file changed

+24
-21
lines changed

1 file changed

+24
-21
lines changed

src/assets/synthetic-data.tsx

Lines changed: 24 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -292,14 +292,21 @@ def run():
292292
293293
# Convert categorical variables to numerical values
294294
df_encoded = real_data.copy()
295-
df_encoded['sex'] = df_encoded['sex'].astype('category').cat.codes
296-
df_encoded['race1'] = df_encoded['race1'].astype('category').cat.codes
297-
df_encoded['bar'] = df_encoded['bar'].astype('category').cat.codes
298-
299295
synth_df_encoded = synthetic_data.copy()
300-
synth_df_encoded['sex'] = synth_df_encoded['sex'].astype('category').cat.codes
301-
synth_df_encoded['race1'] = synth_df_encoded['race1'].astype('category').cat.codes
302-
synth_df_encoded['bar'] = synth_df_encoded['bar'].astype('category').cat.codes
296+
297+
for column in column_dtypes:
298+
if column_dtypes[column] == 'categorical':
299+
df_encoded[column] = df_encoded[column].astype('category').cat.codes
300+
synth_df_encoded[column] = synth_df_encoded[column].astype('category').cat.codes
301+
302+
# df_encoded['sex'] = df_encoded['sex'].astype('category').cat.codes
303+
# df_encoded['race1'] = df_encoded['race1'].astype('category').cat.codes
304+
# df_encoded['bar'] = df_encoded['bar'].astype('category').cat.codes
305+
306+
# synth_df_encoded = synthetic_data.copy()
307+
# synth_df_encoded['sex'] = synth_df_encoded['sex'].astype('category').cat.codes
308+
# synth_df_encoded['race1'] = synth_df_encoded['race1'].astype('category').cat.codes
309+
# synth_df_encoded['bar'] = synth_df_encoded['bar'].astype('category').cat.codes
303310
304311
# Output some results
305312
print("Original Data (first 5 rows):", real_data.head())
@@ -312,7 +319,10 @@ def run():
312319
313320
# results = run_diagnostic(real_data, synthetic_data, target_column='gpa')
314321
# print('Results:', results)
315-
322+
323+
report = MetricsReport(real_data, synthetic_data, metadata)
324+
report_df = report.generate_report()
325+
print('report_df:', report_df)
316326
317327
# combine empty synthetic data with original data and with encoded data
318328
combined_data = pd.concat((real_data.assign(realOrSynthetic='real'), synthetic_data.assign(realOrSynthetic='synthetic')), keys=['real','synthetic'], names=['Data'])
@@ -339,22 +349,15 @@ def run():
339349
'headingKey': 'syntheticData.evaluationOfGeneratedDataTitle'
340350
},
341351
{'reportType': 'univariateDistributionSyntheticData'},
342-
# {
343-
# 'reportType': 'table',
344-
# 'titleKey': 'syntheticData.diagnosticsTitle',
345-
# 'showIndex' : False,
346-
# 'data': json.dumps([
347-
# {
348-
# 'attribute': key,
349-
# 'ks_stat': values['ks_stat'],
350-
# 'p_value': values['p_value']
351-
# }
352-
# for key, values in results['distribution_results'].items()
353-
# ]),
352+
{
353+
'reportType': 'table',
354+
'titleKey': 'syntheticData.diagnosticsTitle',
355+
'showIndex' : False,
356+
'data': report_df.to_json(orient="records"),
354357
# 'postContent': json.dumps([{
355358
# 'contentType' : 'correlationSyntheticData'
356359
# }])
357-
#},
360+
},
358361
{'reportType': 'bivariateDistributionSyntheticData'}
359362
]
360363
}))

0 commit comments

Comments
 (0)