additional data in report

devhelpr · devhelpr · commit 03a760411390 · 2025-03-19T20:04:02.000+01:00
diff --git a/src/assets/synthetic-data.tsx b/src/assets/synthetic-data.tsx
@@ -161,6 +161,8 @@ def run():
     missingness_dict = md_handler.detect_missingness(real_data)
     print("Detected Missingness Type:", missingness_dict)
 
+    missingness_dict_df = pd.DataFrame(missingness_dict, index=[0])
+
     df_imputed = md_handler.apply_imputation(real_data, missingness_dict)
     
 
@@ -250,7 +252,7 @@ def run():
     setOutputData("syntheticData", synthetic_data.to_json(orient='records'))
 
     report = MetricsReport(df_imputed, synthetic_data, metadata)
-    report_df = report.generate_report()
+    report_df = report.generate_report()    
     print('report_df:', report_df)
 
     # combine empty synthetic data with original data and with encoded data 
@@ -268,21 +270,17 @@ def run():
             print("=== Regression Efficacy Metrics ===", column)
             print(reg_metrics)
 
-    # reg_efficacy = EfficacyMetrics(task='regression', target_column="ugpa")
-    # reg_metrics = reg_efficacy.evaluate(df_imputed, synthetic_data)
-    # print("=== Regression Efficacy Metrics ===  UGPA")
-    # print(reg_metrics)
-
     clf_efficacy = EfficacyMetrics(task='classification', target_column="bar")
     clf_metrics = clf_efficacy.evaluate(df_imputed, synthetic_data)
     print("=== Classification Efficacy Metrics === BAR")
     print(clf_metrics)
 
-
     dp = DisclosureProtection(df_imputed, synthetic_data)
     dp_score = dp.score()
     dp_report = dp.report()
 
+    dp_report_df = pd.DataFrame(dp_report, index=[0])
+
     print("=== Disclosure Protection ===")
     print(f"Score: {dp_score:.3f}")
     print("Detailed Report:", dp_report)
@@ -296,12 +294,26 @@ def run():
         'combined_data' : combined_data.to_json(orient="records"),
         'realCorrelations': df_encoded.corr().to_json(orient="records"),
         'synthDataCorrelations': synth_df_encoded.corr().to_json(orient="records"),
-        'reports' : [            
+        'reports' : [ 
             {
                 'reportType': 'heading',
-                'headingKey': 'syntheticData.cartModelTitle' if sdgMethod == 'cart' else 'syntheticData.gaussianCopulaModelTitle'
+                'headingKey': 'syntheticData.handlingMissingDataTitle' 
+            },
+            {
+                'reportType': 'text',
+                'textKey': 'syntheticData.handlingMissingDataDescription'
             },
-             {
+            {            
+                'reportType': 'table',
+                'titleKey': 'syntheticData.handlingMissingDataTableTitle',
+                'showIndex' : False,    
+                'data': missingness_dict_df.to_json(orient="records"),                                            
+            },
+            {
+                'reportType': 'heading',
+                'headingKey': 'syntheticData.cartModelTitle' if sdgMethod == 'cart' else 'syntheticData.gaussianCopulaModelTitle'
+            },           
+            {
                 'reportType': 'text',
                 'textKey': 'syntheticData.cartModelDescription' if sdgMethod == 'cart' else 'syntheticData.gaussianCopulaModelDescription'
             },
@@ -310,6 +322,10 @@ def run():
                 'headingKey': 'syntheticData.evaluationOfGeneratedDataTitle'
             },
             {'reportType': 'univariateDistributionSyntheticData'},
+            {
+                'reportType': 'heading',
+                'headingKey': 'syntheticData.diagnosticsReportTitle'
+            },
             {            
                 'reportType': 'table',
                 'titleKey': 'syntheticData.diagnosticsTitle',
@@ -319,7 +335,18 @@ def run():
                     'contentType' : 'correlationSyntheticData'
                 }]
             },
-            {'reportType': 'bivariateDistributionSyntheticData'}
+            {
+                'reportType': 'table',
+                'titleKey': 'syntheticData.disclosureProtectionTitle',
+                'showIndex' : False,
+                'data': dp_report_df.to_json(orient="records"),                
+            },
+            {
+                'reportType': 'heading',
+                'headingKey': 'syntheticData.bivariateDistributionSyntheticDataTitle'
+            },
+            {'reportType': 'bivariateDistributionSyntheticData'},
+            
         ]
     }))
 
@@ -334,6 +361,12 @@ def run():
         'data': synthetic_data.head().to_json(orient="records")
     }))
 
+
+    setResult(json.dumps({
+        'type': 'heading',
+        'headingKey': 'syntheticData.moreInfoTitle'
+    }))
+
     setResult(json.dumps({
         'type': 'text',
         'key': 'syntheticData.moreInfo'
diff --git a/src/locales/en.json b/src/locales/en.json
@@ -125,13 +125,20 @@
             "heading": "0. Preview of real data"
         },
         "columnsInDataset": "1. Data types detection",
+        "handlingMissingDataTitle": "2. Handling missing data",
+        "handlingMissingDataDescription": "Handling missing data description",
+        "handlingMissingDataTableTitle": "Columns with missing data",
         "_explanatoryDataAnalysisTitle": "2. Explanatory data analysis",
-        "cartModelTitle": "2. Method: CART model",
-        "gaussianCopulaModelTitle": "2. Method: Gaussian Copula model",
+        "cartModelTitle": "3. Method: CART model",
+        "gaussianCopulaModelTitle": "3. Method: Gaussian Copula model",
         "cartModelDescription": "The CART (Classification and Regression Trees) method generates synthetic data by learning patterns from real data through a decision tree that splits data into homogeneous groups based on feature values. It predicts averages for numerical data and assigns the most common category for categorical data, using these predictions to create new synthetic points.",
-        "evaluationOfGeneratedDataTitle": "3. Evaluation of generated data",
-        "outputDataTitle": "4. Generated synthetic data",
+        "evaluationOfGeneratedDataTitle": "4. Evaluation of generated data",
+        "diagnosticsReportTitle": "5. Diagnostic Report",
         "diagnosticsTitle": "Diagnostic Results",
+        "disclosureProtectionTitle": "Disclosure protection",
+        "bivariateDistributionSyntheticDataTitle": "6. Bivariate distributions",
+        "outputDataTitle": "7. Generated synthetic data",
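+        "moreInfoTitle": "8. More information",
         "correlationDifference": "Correlation difference: {{correlationDifference}}",
         "univariateText": "{{samples}} synthetic data points are generated using CART. The figures below display the value frequency for each variable. The synthetic data is of high quality when the frequencies are approximately the same.",
-        "bivariateText": "The figures below display the differences in value frequency for a combination of variables. For comparing two categorical variables, bar charts are plotted. For comparing a numerical and a categorical variables, a so called [violin plot](https://en.wikipedia.org/wiki/Violin_plot) is shown. For comparing two numercial variables, a [LOESS plot](https://en.wikipedia.org/wiki/Local_regression) is created. For all plots holds: the synthetic data is of high quality when the shape of the distributions in the synthetic data equal the distributions in the real data.",
+        "bivariateText": "The figures below display the differences in value frequency for a combination of variables. For comparing two categorical variables, bar charts are plotted. For comparing a numerical and a categorical variable, a so-called [violin plot](https://en.wikipedia.org/wiki/Violin_plot) is shown. For comparing two numerical variables, a [LOESS plot](https://en.wikipedia.org/wiki/Local_regression) is created. For all plots, the following holds: the synthetic data is of high quality when the shape of the distributions in the synthetic data equals the shape of the distributions in the real data.",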
diff --git a/src/locales/nl.json b/src/locales/nl.json
@@ -125,13 +125,20 @@
             "heading": "0. Preview van de data"
         },
         "columnsInDataset": "1. Detectie van datatypes",
-        "_explanatoryDataAnalysisTitle": "2. Explanatory data analyse",
+        "handlingMissingDataTitle": "2. Handling missing data",
+        "handlingMissingDataDescription": "Handling missing data description",
+        "handlingMissingDataTableTitle": "Columns with missing data",
+        "_explanatoryDataAnalysisTitle": "3. Explanatory data analyse",
-        "cartModelTitle": "2. Methode: CART model",
+        "cartModelTitle": "3. Methode: CART model",
-        "gaussianCopulaModelTitle": "2. Methode: Gaussian Copula model",
+        "gaussianCopulaModelTitle": "3. Methode: Gaussian Copula model",
         "cartModelDescription": "De CART-methode (Classification and Regression Trees) genereert synthetische data door patronen uit echte data te leren via een beslisboom die de data opdeelt in homogene groepen op basis van kenmerkwaarden. Voor numerieke data voorspelt de methode gemiddelden, en voor categorische data wijst het de meest voorkomende categorie toe. Deze voorspellingen worden vervolgens gebruikt om nieuwe synthetische gegevenspunten te creëren.",
-        "evaluationOfGeneratedDataTitle": "3. Evaluatie van gegenereerde data",
-        "outputDataTitle": "4. Output data",
+        "evaluationOfGeneratedDataTitle": "4. Evaluatie van gegenereerde data",
+        "diagnosticsReportTitle": "5. Diagnostisch rapport",
         "diagnosticsTitle": "Diagnostische Resultaten",
+        "disclosureProtectionTitle": "Disclosure protection",
+        "bivariateDistributionSyntheticDataTitle": "6. Bivariate distributies",
+        "outputDataTitle": "7. Output data",
+        "moreInfoTitle": "8. Meer informatie",
         "correlationDifference": "Correlatie verschil: {{correlationDifference}}",
         "moreInfo": "&nbsp;&nbsp;\n  \n  \n  \nWil je meer weten over synthetische data?\n  \n  \n  \n- [python-synthpop op Github](https://github.com/NGO-Algorithm-Audit/python-synthpop)\n- [local-first web app op Github](https://github.com/NGO-Algorithm-Audit/local-first-web-tool/tree/main)\n- [Synthetische Data: wat, waarom en hoe?](https://royalsociety.org/-/media/policy/projects/privacy-enhancing-technologies/Synthetic_Data_Survey-24.pdf)\n- [Kennis Netwerk Synthetische Data](https://online.rijksinnovatiecommunity.nl/groups/399-kennisnetwerk-synthetischedata/welcome) (for Dutch public organizations)\n- [Synthetische data portaal van DUO](https://duo.nl/open_onderwijsdata/footer/synthetische-data.jsp)\n- [CART: synthpop resources](https://synthpop.org.uk/resources.html)\n- [Gaussian Copula - Synthetic Data Vault](https://docs.sdv.dev/sdv)"
     },