Merge pull request #66 from NGO-Algorithm-Audit/feature/ui-texts

devhelpr · web-flow · commit e48e8a2476b6 · 2025-03-25T19:00:07.000+01:00
Feature/UI texts
diff --git a/src/assets/synthetic-data.tsx b/src/assets/synthetic-data.tsx
@@ -7,7 +7,7 @@ import warnings
 import scipy.stats as stats
 from scipy.stats import norm, ks_2samp
 from sklearn.preprocessing import LabelEncoder
-from synthpop import MissingDataHandler, DataProcessor, CARTMethod
+from synthpop import MissingDataHandler, DataProcessor, CARTMethod, GaussianCopulaMethod
 from synthpop.metrics import (
     MetricsReport,
     EfficacyMetrics,
@@ -177,11 +177,16 @@ def run():
     # Preprocess the data: transforms raw data into a numerical format
     processed_data = processor.preprocess(df_imputed)    
 
-    cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
-    cart.fit(processed_data)
-    
-    synthetic_processed = cart.sample(samples)
-    
+    if (sdgMethod == 'cart'):
+        cart = CARTMethod(metadata, smoothing=True, proper=True, minibucket=5, random_state=42)
+        cart.fit(processed_data)
+        synthetic_processed = cart.sample(samples)
+
+    if (sdgMethod == 'gc'):
+        gc = GaussianCopulaMethod(metadata)
+        gc.fit(processed_data)
+        synthetic_processed = gc.sample(samples)
+
     print("synthetic_processed (first 5 rows):", synthetic_processed.head())
 
     synthetic_data = processor.postprocess(synthetic_processed)
@@ -225,25 +230,25 @@ def run():
         # spop.fit(real_data, dtypes=dtypes_dict)
         # synthetic_data = spop.generate(k=samples)
 
-    if (sdgMethod == 'gc'):
+    # if (sdgMethod == 'gc'):
         # Initialize synthesizer and fit it to the data
-        synthesizer = GaussianCopulaSynthesizer()
+        # synthesizer = GaussianCopulaSynthesizer()
         
         # Handle NaN values based on the selected treatment method
-        if nanTreatment == 'drop':
-            df_imputed = df_imputed.dropna()
-        elif nanTreatment == 'impute':
+        # if nanTreatment == 'drop':
+        #    df_imputed = df_imputed.dropna()
+        # elif nanTreatment == 'impute':
             # Use mean imputation for numerical columns and mode imputation for categorical columns
-            for column in df_imputed.columns:
-                if column_dtypes[column] == 'categorical':
-                    df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
-                else:
-                    df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
+        #    for column in df_imputed.columns:
+        #        if column_dtypes[column] == 'categorical':
+        #            df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mode()[0])
+        #        else:
+        #            df_imputed[column] = df_imputed[column].fillna(df_imputed[column].mean())
         
-        synthesizer.fit(df_imputed)
+        #synthesizer.fit(df_imputed)
 
         # Generate synthetic data
-        synthetic_data = synthesizer.sample(samples)
+        # synthetic_data = synthesizer.sample(samples)
 
     synth_df_decoded = synthetic_data.copy()
 
@@ -358,16 +363,24 @@ def run():
             {
                 'reportType': 'heading2',
                 'headingKey': 'syntheticData.diagnosticsReportTitle'
-            },
+            },            
             {            
                 'reportType': 'table',
                 'titleKey': 'syntheticData.diagnosticsTitle',
-                'showIndex' : False,    
+                'showIndex' : False,   
+                'preContent' : [{
+                    'contentType': 'text',
+                    'textKey': 'syntheticData.diagnosticsReportDescription'    
+                }],
                 'data': report_df.to_json(orient="records"),                                           
             },
             {            
                 'reportType': 'correlationSyntheticData',
-                'titleKey': 'syntheticData.correlationMatrixTitle',                
+                'titleKey': 'syntheticData.correlationMatrixTitle',
+                'preContent' : [{
+                    'contentType': 'text',
+                    'textKey': 'syntheticData.correlationMatrixDescription'    
+                }],                    
             },
             {
                 'reportType': 'table',
diff --git a/src/components/DistributionReport.tsx b/src/components/DistributionReport.tsx
@@ -137,7 +137,7 @@ export const DistributionReport = (
                             return null;
                         }
                         const preContent: additionalContent = report.preContent
-                            ? JSON.parse(report.preContent)
+                            ? (report.preContent as unknown as additionalContent)
                             : [];
                         const postContent: additionalContent =
                             (report.postContent as unknown as additionalContent) ??
@@ -255,7 +255,7 @@ export const DistributionReport = (
                             return null;
                         }
                         const preContent: additionalContent = report.preContent
-                            ? JSON.parse(report.preContent)
+                            ? (report.preContent as unknown as additionalContent)
                             : [];
                         const postContent: additionalContent =
                             (report.postContent as unknown as additionalContent) ??
diff --git a/src/components/componentMapper.tsx b/src/components/componentMapper.tsx
@@ -128,7 +128,7 @@ export default function ComponentMapper({
 
                     case 'text':
                         return (
-                            <TooltipProvider>
+                            <TooltipProvider key={index}>
                                 <MarkdownWithTooltips
                                     key={index}
                                     className="-mt-2 text-gray-800 markdown"
diff --git a/src/locales/en.ts b/src/locales/en.ts
@@ -154,7 +154,39 @@ export const en = {
         distributionsTitle: '4.1 Distributions',
         diagnosticsReportTitle: '4.2. Diagnostic Report',
         diagnosticsTitle: 'Diagnostic Results',
+        diagnosticsReportDescription: `For each column, diagnostic results are computed for the quality of the generated synthetic data. The computed metrics depend on the type of data.
+
+For numerical (or datetime) columns the following metrics are computed:
+- {tooltip:syntheticData.missingValueSimilarity}Missing value similarity{/tooltip}
+- {tooltip:syntheticData.rangeCoverage}Range coverage{/tooltip}
+- {tooltip:syntheticData.boundaryAdherenc}Boundary adherence{/tooltip}
+- {tooltip:syntheticData.statisticSimilarity}Statistic similarity{/tooltip}
+- {tooltip:syntheticData.kolmogorovSmirnovComplement}Kolmogorov–Smirnov (KS) complement{/tooltip}
+
+For categorical (or boolean) columns the following metrics are computed:
+- {tooltip:syntheticData.missingValueSimilarity}Missing value similarity{/tooltip}
+- {tooltip:syntheticData.categoryCoverage}Category coverage{/tooltip}
+- {tooltip:syntheticData.categoryAdherence}Category adherence{/tooltip}
+- {tooltip:syntheticData.totalVariationComplement}Total variation (TV) complement{/tooltip}`,
+        missingValueSimilarity:
+            'Compares whether the synthetic data has the same proportion of missing values as the real data for a given column',
+        rangeCoverage:
+            'Measures whether a synthetic column covers the full range of values that are present in a real column',
+        boundaryAdherenc:
+            'Measures whether a synthetic column respects the minimum and maximum values of the real column. It returns the percentage of synthetic rows that adhere to the real boundaries',
+        statisticSimilarity:
+            'Measures the similarity between a real column and a synthetic column by comparing the mean, standard deviation and median',
+        kolmogorovSmirnovComplement:
+            'Computes the similarity of a real and synthetic numerical column in terms of the column shapes, i.e., the marginal distribution or 1D histogram of the column.',
+        categoryCoverage:
+            'Measures whether a synthetic column covers all the possible categories that are present in a real column',
+        categoryAdherence:
+            'Measures whether a synthetic column adheres to the same category values as the real data',
+        totalVariationComplement:
+            'Computes the similarity of a real and synthetic categorical column in terms of the column shapes, i.e., the marginal distribution or 1D histogram of the column.',
         correlationMatrixTitle: 'Correlation matrix',
+        correlationMatrixDescription: `The matrix below illustrates the differences in pairwise correlations between variables in the original and synthetic data. 
+Green cells signify that the pairwise correlation was accurately captured, with 0 representing the best possible score. Red cells indicate poor capture of the pairwise correlation.`,
         efficacyMetricsTitle: 'Efficacy metrics',
         disclosureProtectionTitle: 'Privacy metrics',
         outputDataTitle: '5. Generated synthetic data',