Skip to content

Commit 03a7604

Browse files
committed
additional data in report
1 parent 960e184 commit 03a7604

File tree

3 files changed

+66
-19
lines changed

3 files changed

+66
-19
lines changed

src/assets/synthetic-data.tsx

Lines changed: 44 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,8 @@ def run():
161161
missingness_dict = md_handler.detect_missingness(real_data)
162162
print("Detected Missingness Type:", missingness_dict)
163163
164+
missingness_dict_df = pd.DataFrame(missingness_dict, index=[0])
165+
164166
df_imputed = md_handler.apply_imputation(real_data, missingness_dict)
165167
166168
@@ -250,7 +252,7 @@ def run():
250252
setOutputData("syntheticData", synthetic_data.to_json(orient='records'))
251253
252254
report = MetricsReport(df_imputed, synthetic_data, metadata)
253-
report_df = report.generate_report()
255+
report_df = report.generate_report()
254256
print('report_df:', report_df)
255257
256258
# combine empty synthetic data with original data and with encoded data
@@ -268,21 +270,17 @@ def run():
268270
print("=== Regression Efficacy Metrics ===", column)
269271
print(reg_metrics)
270272
271-
# reg_efficacy = EfficacyMetrics(task='regression', target_column="ugpa")
272-
# reg_metrics = reg_efficacy.evaluate(df_imputed, synthetic_data)
273-
# print("=== Regression Efficacy Metrics === UGPA")
274-
# print(reg_metrics)
275-
276273
clf_efficacy = EfficacyMetrics(task='classification', target_column="bar")
277274
clf_metrics = clf_efficacy.evaluate(df_imputed, synthetic_data)
278275
print("=== Classification Efficacy Metrics === BAR")
279276
print(clf_metrics)
280277
281-
282278
dp = DisclosureProtection(df_imputed, synthetic_data)
283279
dp_score = dp.score()
284280
dp_report = dp.report()
285281
282+
dp_report_df = pd.DataFrame(dp_report, index=[0])
283+
286284
print("=== Disclosure Protection ===")
287285
print(f"Score: {dp_score:.3f}")
288286
print("Detailed Report:", dp_report)
@@ -296,12 +294,26 @@ def run():
296294
'combined_data' : combined_data.to_json(orient="records"),
297295
'realCorrelations': df_encoded.corr().to_json(orient="records"),
298296
'synthDataCorrelations': synth_df_encoded.corr().to_json(orient="records"),
299-
'reports' : [
297+
'reports' : [
300298
{
301299
'reportType': 'heading',
302-
'headingKey': 'syntheticData.cartModelTitle' if sdgMethod == 'cart' else 'syntheticData.gaussianCopulaModelTitle'
300+
'headingKey': 'syntheticData.handlingMissingDataTitle'
301+
},
302+
{
303+
'reportType': 'text',
304+
'textKey': 'syntheticData.handlingMissingDataDescription'
303305
},
304-
{
306+
{
307+
'reportType': 'table',
308+
'titleKey': 'syntheticData.handlingMissingDataTableTitle',
309+
'showIndex' : False,
310+
'data': missingness_dict_df.to_json(orient="records"),
311+
},
312+
{
313+
'reportType': 'heading',
314+
'headingKey': 'syntheticData.cartModelTitle' if sdgMethod == 'cart' else 'syntheticData.gaussianCopulaModelTitle'
315+
},
316+
{
305317
'reportType': 'text',
306318
'textKey': 'syntheticData.cartModelDescription' if sdgMethod == 'cart' else 'syntheticData.gaussianCopulaModelDescription'
307319
},
@@ -310,6 +322,10 @@ def run():
310322
'headingKey': 'syntheticData.evaluationOfGeneratedDataTitle'
311323
},
312324
{'reportType': 'univariateDistributionSyntheticData'},
325+
{
326+
'reportType': 'heading',
327+
'headingKey': 'syntheticData.diagnosticsReportTitle'
328+
},
313329
{
314330
'reportType': 'table',
315331
'titleKey': 'syntheticData.diagnosticsTitle',
@@ -319,7 +335,18 @@ def run():
319335
'contentType' : 'correlationSyntheticData'
320336
}]
321337
},
322-
{'reportType': 'bivariateDistributionSyntheticData'}
338+
{
339+
'reportType': 'table',
340+
'titleKey': 'syntheticData.disclosureProtectionTitle',
341+
'showIndex' : False,
342+
'data': dp_report_df.to_json(orient="records"),
343+
},
344+
{
345+
'reportType': 'heading',
346+
'headingKey': 'syntheticData.bivariateDistributionSyntheticDataTitle'
347+
},
348+
{'reportType': 'bivariateDistributionSyntheticData'},
349+
323350
]
324351
}))
325352
@@ -334,6 +361,12 @@ def run():
334361
'data': synthetic_data.head().to_json(orient="records")
335362
}))
336363
364+
365+
setResult(json.dumps({
366+
'type': 'heading',
367+
'headingKey': 'syntheticData.moreInfoTitle'
368+
}))
369+
337370
setResult(json.dumps({
338371
'type': 'text',
339372
'key': 'syntheticData.moreInfo'

src/locales/en.json

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,13 +125,20 @@
125125
"heading": "0. Preview of real data"
126126
},
127127
"columnsInDataset": "1. Data types detection",
128+
"handlingMissingDataTitle": "2. Handling missing data",
129+
"handlingMissingDataDescription": "Handling missing data description",
130+
"handlingMissingDataTableTitle": "Columns with missing data",
128131
"_explanatoryDataAnalysisTitle": "2. Explanatory data analysis",
129-
"cartModelTitle": "2. Method: CART model",
130-
"gaussianCopulaModelTitle": "2. Method: Gaussian Copula model",
132+
"cartModelTitle": "3. Method: CART model",
133+
"gaussianCopulaModelTitle": "3. Method: Gaussian Copula model",
131134
"cartModelDescription": "The CART (Classification and Regression Trees) method generates synthetic data by learning patterns from real data through a decision tree that splits data into homogeneous groups based on feature values. It predicts averages for numerical data and assigns the most common category for categorical data, using these predictions to create new synthetic points.",
132-
"evaluationOfGeneratedDataTitle": "3. Evaluation of generated data",
133-
"outputDataTitle": "4. Generated synthetic data",
135+
"evaluationOfGeneratedDataTitle": "4. Evaluation of generated data",
136+
"diagnosticsReportTitle": "5. Diagnostic Report",
134137
"diagnosticsTitle": "Diagnostic Results",
138+
"disclosureProtectionTitle": "Disclosure protection",
139+
"bivariateDistributionSyntheticDataTitle": "6. Bivariate distributions",
140+
"outputDataTitle": "7. Generated synthetic data",
141+
"moreInfoTitle": "8. More information",
135142
"correlationDifference": "Correlation difference: {{correlationDifference}}",
136143
"univariateText": "{{samples}} synthetic data points are generated using CART. The figures below display the value frequency for each variable. The synthetic data is of high quality when the frequencies are approximately the same.",
137144
"bivariateText": "The figures below display the differences in value frequency for a combination of variables. For comparing two categorical variables, bar charts are plotted. For comparing a numerical and a categorical variable, a so-called [violin plot](https://en.wikipedia.org/wiki/Violin_plot) is shown. For comparing two numerical variables, a [LOESS plot](https://en.wikipedia.org/wiki/Local_regression) is created. For all plots, the following holds: the synthetic data is of high quality when the shape of the distributions in the synthetic data equals the distributions in the real data.",

src/locales/nl.json

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -125,13 +125,20 @@
125125
"heading": "0. Preview van de data"
126126
},
127127
"columnsInDataset": "1. Detectie van datatypes",
128-
"_explanatoryDataAnalysisTitle": "2. Explanatory data analyse",
128+
"handlingMissingDataTitle": "2. Omgaan met ontbrekende data",
129+
"handlingMissingDataDescription": "Handling missing data description",
130+
"handlingMissingDataTableTitle": "Kolommen met ontbrekende data",
131+
"_explanatoryDataAnalysisTitle": "3. Explanatory data analyse",
129132
"cartModelTitle": "3. Methode: CART model",
130-
"gaussianCopulaModelTitle": "2. Methode: Gaussian Copula model",
133+
"gaussianCopulaModelTitle": "3. Methode: Gaussian Copula model",
131134
"cartModelDescription": "De CART-methode (Classification and Regression Trees) genereert synthetische data door patronen uit echte data te leren via een beslisboom die de data opdeelt in homogene groepen op basis van kenmerkwaarden. Voor numerieke data voorspelt de methode gemiddelden, en voor categorische data wijst het de meest voorkomende categorie toe. Deze voorspellingen worden vervolgens gebruikt om nieuwe synthetische gegevenspunten te creëren.",
132-
"evaluationOfGeneratedDataTitle": "3. Evaluatie van gegenereerde data",
133-
"outputDataTitle": "4. Output data",
135+
"evaluationOfGeneratedDataTitle": "4. Evaluatie van gegenereerde data",
136+
"diagnosticsReportTitle": "5. Diagnostisch rapport",
134137
"diagnosticsTitle": "Diagnostische Resultaten",
138+
"disclosureProtectionTitle": "Disclosure protection",
139+
"bivariateDistributionSyntheticDataTitle": "6. Bivariate distributies",
140+
"outputDataTitle": "7. Output data",
141+
"moreInfoTitle": "8. Meer informatie",
135142
"correlationDifference": "Correlatie verschil: {{correlationDifference}}",
136143
"moreInfo": "  \n \n \n \nWil je meer weten over synthetische data?\n \n \n \n- [python-synthpop op Github](https://github.com/NGO-Algorithm-Audit/python-synthpop)\n- [local-first web app op Github](https://github.com/NGO-Algorithm-Audit/local-first-web-tool/tree/main)\n- [Synthetische Data: wat, waarom en hoe?](https://royalsociety.org/-/media/policy/projects/privacy-enhancing-technologies/Synthetic_Data_Survey-24.pdf)\n- [Kennis Netwerk Synthetische Data](https://online.rijksinnovatiecommunity.nl/groups/399-kennisnetwerk-synthetischedata/welcome) (for Dutch public organizations)\n- [Synthetische data portaal van DUO](https://duo.nl/open_onderwijsdata/footer/synthetische-data.jsp)\n- [CART: synthpop resources](https://synthpop.org.uk/resources.html)\n- [Gaussian Copula - Synthetic Data Vault](https://docs.sdv.dev/sdv)"
137144
},

0 commit comments

Comments
 (0)