Skip to content

Commit 1d20a4d

Browse files
authored
Merge pull request #106 from NGO-Algorithm-Audit/feature/fixes-11-jun-ubdt-2
Feature/fixes 11 jun ubdt 2
2 parents a3be459 + 5e04b3d commit 1d20a4d

File tree

7 files changed

+105
-92
lines changed

7 files changed

+105
-92
lines changed

src/assets/bias-detection-python-code.tsx

Lines changed: 74 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -447,74 +447,69 @@ def run():
447447
print("The most biased cluster has a significantly higher average bias metric than the rest of the dataset.")
448448
else:
449449
print("No significant difference in average bias metric between the most biased cluster and the rest of the dataset.")
450-
451-
setResult(json.dumps({
452-
'type': 'heading',
453-
'headingKey': 'biasAnalysis.distribution.mainHeading'
454-
}))
455-
456-
450+
457451
# visualize the clusters
458452
459453
# Group by cluster_label and count the occurrences
460454
cluster_counts = decoded_X_test["cluster_label"].value_counts()
461455
print(f"cluster_counts: {cluster_counts}")
462456
457+
if p_val < 0.05:
463458
464-
if localDataType == 'numeric':
465-
# Calculate mean per cluster for each variable
466-
means = test_df.groupby("cluster_label").mean()
467-
468-
# Calculate overall mean for each variable (excluding cluster_label)
469-
variables = X_test.columns.tolist()
470-
overall_means = test_df[variables].mean()
471-
472-
dropdownCategories = []
473-
for i, column in enumerate(X_test.columns):
474-
if column != bias_score:
475-
dropdownCategories.append(column)
476-
477-
# Plot bar charts for each variable, showing means for each cluster and overall mean as red line
478-
n_vars = len(variables)
479-
n_cols = 2
480-
n_rows = int(np.ceil(n_vars / n_cols))
481-
482-
charts = []
459+
setResult(json.dumps({
460+
'type': 'heading',
461+
'headingKey': 'biasAnalysis.distribution.mainHeading'
462+
}))
463+
464+
if localDataType == 'numeric':
465+
# Calculate mean per cluster for each variable
466+
means = test_df.groupby("cluster_label").mean()
483467
484-
for i, var in enumerate(variables):
485-
486-
print(f"means: {var}")
487-
print(overall_means[var])
488-
print(means[var])
489-
print(f"========================")
490-
468+
# Calculate overall mean for each variable (excluding cluster_label)
469+
variables = X_test.columns.tolist()
470+
overall_means = test_df[variables].mean()
491471
492-
charts.append({
493-
'yAxisLabel': 'distribution.frequency',
494-
'type': 'clusterNumericalVariableDistribution',
495-
'headingKey': 'biasAnalysis.distribution.heading',
496-
'title': var,
497-
'meanValue': overall_means[var],
498-
'data': means[var].to_json(orient='records'),
499-
'params': {'variable': var},
500-
'selectFilterGroup' : var,
501-
'defaultFilter': X_test.columns[0]
502-
})
472+
dropdownCategories = []
473+
for i, column in enumerate(X_test.columns):
474+
if column != bias_score:
475+
dropdownCategories.append(column)
476+
477+
# Plot bar charts for each variable, showing means for each cluster and overall mean as red line
478+
n_vars = len(variables)
479+
n_cols = 2
480+
n_rows = int(np.ceil(n_vars / n_cols))
481+
482+
charts = []
483+
484+
for i, var in enumerate(variables):
485+
486+
print(f"means: {var}")
487+
print(overall_means[var])
488+
print(means[var])
489+
print(f"========================")
490+
503491
504-
setResult(json.dumps({
505-
'type': 'clusterNumericalVariableDistributionAccordeon',
506-
'clusterCount': clusterCount,
507-
'charts': charts,
508-
'values': dropdownCategories,
509-
'titleKey': "biasAnalysis.numericalVariableDistributionAcrossClustersAccordeonTitle",
510-
'defaultValue': X_test.columns[0]
511-
}))
512-
513-
if p_val < 0.05:
492+
charts.append({
493+
'yAxisLabel': 'distribution.frequency',
494+
'type': 'clusterNumericalVariableDistribution',
495+
'headingKey': 'biasAnalysis.distribution.heading',
496+
'title': var,
497+
'meanValue': overall_means[var],
498+
'data': means[var].to_json(orient='records'),
499+
'params': {'variable': var},
500+
'selectFilterGroup' : var,
501+
'defaultFilter': X_test.columns[0]
502+
})
514503
515-
if localDataType == 'numeric':
516-
# see above for the code
517-
print("Statistically significant differences in means found.")
504+
setResult(json.dumps({
505+
'type': 'clusterNumericalVariableDistributionAccordeon',
506+
'clusterCount': clusterCount,
507+
'charts': charts,
508+
'values': dropdownCategories,
509+
'titleKey': "biasAnalysis.numericalVariableDistributionAcrossClustersAccordeonTitle",
510+
'defaultValue': X_test.columns[0]
511+
}))
512+
518513
else:
519514
# Create subplots for each column
520515
columns_to_analyze = [col for col in decoded_X_test.columns if col not in [bias_score, "cluster_label"]]
@@ -577,43 +572,44 @@ def run():
577572
setOutputData("mostBiasedCluster", df_most_biased_cluster.to_json(orient='records'))
578573
setOutputData("otherClusters", df_other.to_json(orient='records'))
579574
580-
575+
581576
setResult(json.dumps({
582577
'type': 'heading',
583578
'headingKey': 'biasAnalysis.conclusion'
584579
}))
585580
586-
setResult(json.dumps({
587-
'type': 'text',
588-
'key': 'biasAnalysis.conclusionDescription'
589-
}))
581+
590582
591583
# Calculate the difference in percentage for each category value between cluster 0 and the entire dataset
592584
diff_percentages = {}
593585
594586
# Select only cluster 0
595587
cluster_0 = decoded_X_test[decoded_X_test["cluster_label"] == 0]
596588
597-
if (localDataType == 'numeric'):
598-
599-
comparisons = t_test_on_cluster(test_df, bias_score, cluster_label=0)
600-
589+
if p_val < 0.05:
601590
setResult(json.dumps({
602-
'type': 'accordion',
603-
'titleKey': 'biasAnalysis.biasedCluster.accordionTitle',
604-
'comparisons': comparisons
591+
'type': 'text',
592+
'key': 'biasAnalysis.conclusionDescription'
605593
}))
606-
else:
607-
comparisons = chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label=0)
608594
609-
setResult(json.dumps({
610-
'type': 'accordion',
611-
'titleKey': 'biasAnalysis.biasedCluster.accordionTitle',
612-
'comparisons': comparisons,
613-
'className': 'biasAnalysis-biasedClusterAccordion'
614-
}))
615-
595+
if (localDataType == 'numeric'):
596+
597+
comparisons = t_test_on_cluster(test_df, bias_score, cluster_label=0)
616598
599+
setResult(json.dumps({
600+
'type': 'accordion',
601+
'titleKey': 'biasAnalysis.biasedCluster.accordionTitle',
602+
'comparisons': comparisons
603+
}))
604+
else:
605+
comparisons = chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label=0)
606+
607+
setResult(json.dumps({
608+
'type': 'accordion',
609+
'titleKey': 'biasAnalysis.biasedCluster.accordionTitle',
610+
'comparisons': comparisons,
611+
'className': 'biasAnalysis-biasedClusterAccordion'
612+
}))
617613
618614
619615
setResult(json.dumps({

src/assets/synthetic-data.tsx

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ def run():
168168
169169
setResult(json.dumps({
170170
'type': 'list',
171-
'list': dataInfo
171+
'list': dataInfo,
172+
'translateValue' : True,
172173
}))
173174
174175
setResult(json.dumps({
@@ -255,11 +256,7 @@ def run():
255256
{
256257
'reportType': 'heading',
257258
'headingKey': 'syntheticData.handlingMissingDataTitle'
258-
},
259-
{
260-
'reportType': 'text',
261-
'textKey': 'syntheticData.handlingMissingDataDescription'
262-
},
259+
},
263260
{
264261
'reportType': 'table',
265262
'titleKey': 'syntheticData.handlingMissingDataTableTitle',
@@ -278,7 +275,7 @@ def run():
278275
},
279276
{
280277
'reportType': 'text',
281-
'textKey': 'syntheticData.cartModelDescription' if sdgMethod == 'cart' else 'syntheticData.gaussianCopulaModelDescription',
278+
'textKey': 'syntheticData.cartModelDescription' if sdgMethod == 'cart' else 'syntheticData.gcModelDescription',
282279
'params': {
283280
'samples': samples,
284281
}

src/components/SyntheticDataSettings.tsx

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,9 +305,17 @@ export default function SyntheticDataSettings({
305305
</div>
306306

307307
<div className="grid gap-3">
308-
<Label htmlFor="samples">
308+
<Label
309+
htmlFor="samples"
310+
className="flex flex-row items-center gap-1"
311+
>
309312
{t('syntheticData.form.fieldset.samples')} (
310313
{outputSamples})
314+
<IconInfoTooltip
315+
tooltipText={t(
316+
'syntheticData.form.fieldset.outputSamplesTooltip'
317+
)}
318+
/>
311319
</Label>
312320
<Slider
313321
id="samples"

src/components/componentMapper.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ export default function ComponentMapper({
8282
) => (
8383
<li key={`list-${index}`}>
8484
<span className="font-bold">{`${listItem.key}`}</span>
85-
{`: ${listItem.value}`}
85+
{`: ${resultItem.translateValue ? t(listItem.value) : listItem.value}`}
8686
</li>
8787
)
8888
);

src/components/pyodide/use-python.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ export const usePython = <T, TExport>(emptyParams: T) => {
5252
const runPython = useCallback(
5353
(message: { type: 'start'; params: { parameters: T } }) => {
5454
setLoading(true);
55-
setLoadingMessage('running analysis');
55+
setLoadingMessage('runningAnalysis');
5656

5757
setClusterInfo(undefined);
5858
setResult([]);

src/locales/en.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ export const en = {
2020
loadingPackages:
2121
'Loading core packages. On average this takes 10-15 seconds.',
2222
installingPackages: 'Installing additional packages...',
23+
runningAnalysis: 'Running analysis...',
24+
categorical: 'Categorical',
25+
numerical: 'Numerical',
2326
biasSettings: {
2427
dataType: {
2528
numeric: 'Numeric',
@@ -73,9 +76,9 @@ export const en = {
7376
analysisError: 'Error while analysing',
7477
noData: 'No data loaded',
7578
numericDataRequired:
76-
'Selected column must contain numerical data for k-means clustering.',
79+
'Not all data have the same format, please change locally before attaching the data.',
7780
categoricalDataRequired:
78-
'Selected column must contain categorical data for k-modes clustering.',
81+
'Not all data have same format, please change locally before attaching the data.',
7982
},
8083
actions: {
8184
tryItOut: 'Demo dataset',
@@ -149,6 +152,8 @@ export const en = {
149152
"When using Gaussian Copula, you can choose how to handle missing values (NaN values) in your dataset. 'Drop rows with NaN values' removes them completely, while 'Imputate NaN values' replaces them with mean values for numerical columns and mode values for categorical columns",
150153
},
151154
samples: 'Number of synthetic datapoints',
155+
outputSamplesTooltip:
156+
'Number of synthetic data points to be generated by the tool. Due to computational constraints of browser-based synthetic data generation, the maximum is set to 5,000.',
152157
},
153158
actions: {
154159
tryItOut: 'Demo dataset',
@@ -181,6 +186,7 @@ export const en = {
181186
gaussianCopulaModelTitle: '3. Method: Gaussian Copula model',
182187
cartModelDescription:
183188
'The CART (Classification and Regression Trees) method generates synthetic data by learning patterns from real data through a decision tree that splits data into homogeneous groups based on feature values. It predicts averages for numerical data and assigns the most common category for categorical data, using these predictions to create new synthetic points.\n \n {{samples}} synthetic data points are generated.',
189+
gcModelDescription: `Gaussian Copula works in two main steps: 1. The real data is transformed into a uniform distribution. Correlations between variables are modeled using a multivariate normal distribution (the Gaussian copula); and 2. Synthetic data is created by sampling from this Gaussian copula and transforming the samples back to the original data distributions.\n \n {{samples}} synthetic data points are generated.`,
184190
evaluationOfGeneratedDataTitle:
185191
'4. Evaluation of generated synthetic data',
186192
distributionsTitle: '4.1 Distributions',

src/locales/nl.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ export const nl = {
1717
loadingPyodide: 'Python omgeving laden...',
1818
loadingPackages: 'Laden van packages. Dit duurt gemiddeld 10-15 seconden.',
1919
installingPackages: 'Aanvullende packages laden',
20+
runningAnalysis: 'Analyse uitvoeren...',
21+
categorical: 'Categorisch',
22+
numerical: 'Numeriek',
2023
biasSettings: {
2124
dataType: {
2225
numeric: 'Numeriek',
@@ -71,9 +74,9 @@ export const nl = {
7174
analysisError: 'Fout tijdens analyse',
7275
noData: 'Geen gegevens geladen',
7376
numericDataRequired:
74-
'Geselecteerde kolom moet numerieke data bevatten voor k-means clustering.',
77+
'Niet alle data hebben hetzelfde formaat, pas dit lokaal aan voordat je de data toevoegt.',
7578
categoricalDataRequired:
76-
'Geselecteerde kolom moet categorische data bevatten voor k-modes clustering.',
79+
'Niet alle data hebben hetzelfde formaat, pas dit lokaal aan voordat je de data toevoegt.',
7780
},
7881
actions: {
7982
tryItOut: 'Demo dataset',
@@ -149,6 +152,8 @@ export const nl = {
149152
'Bij gebruik van Gaussian Copula kunt u kiezen hoe u omgaat met ontbrekende waarden (NaN waarden) in uw dataset. Het verwijderen van rijen met NaN waarden verwijdert deze volledig, terwijl imputatie deze vervangt door gemiddelde waarden voor numerieke kolommen en modus waarden voor categorische kolommen',
150153
},
151154
samples: 'Aantal synthetische datapunten',
155+
outputSamplesTooltip:
156+
'Aantal synthetische datapunten die door de tool worden gegenereerd. Vanwege de rekencapaciteit van browser-gebaseerde datageneratie is het maximum ingesteld op 5.000.',
152157
},
153158
actions: {
154159
tryItOut: 'Demo dataset',
@@ -183,6 +188,7 @@ export const nl = {
183188
gaussianCopulaModelTitle: '3. Methode: Gaussian Copula model',
184189
cartModelDescription:
185190
'De CART-methode (Classification and Regression Trees) genereert synthetische data door patronen uit echte data te leren via een beslisboom die de data opdeelt in homogene groepen op basis van kenmerken. Voor numerieke data voorspelt de methode gemiddelden en voor categorische data wijst het de meest voorkomende categorie toe. Deze voorspellingen worden vervolgens gebruikt om synthetische datapunten te creëren.\n \n {{samples}} synthetische datapunten zijn gegenereerd.',
191+
gcModelDescription: `Gaussian Copula werkt in twee stappen: 1. De echte data worden getransformeerd naar een uniforme verdeling. Correlaties tussen variabelen worden gemodelleerd met een multivariate normale verdeling (de Gaussian copula); en 2. Synthetische data worden gegenereerd door te sampelen uit deze copula en de samples terug te transformeren naar de oorspronkelijke verdelingen.\n \n {{samples}} synthetische datapunten zijn gegenereerd.`,
186192
evaluationOfGeneratedDataTitle:
187193
'4. Evaluatie van gegenereerde synthetische data',
188194
distributionsTitle: '4.1 Distributies',

0 commit comments

Comments
 (0)