Merge pull request #92 from NGO-Algorithm-Audit/JFP_edits

jfparie · web-flow · commit 698ea4575aa9 · 2025-06-09T00:18:54.000+02:00
Textual edits web app SDG NL EN
diff --git a/src/locales/en.ts b/src/locales/en.ts
@@ -97,7 +97,7 @@ export const en = {
             description:
                 'A subset of the [Law School Admission Bar](https://www.kaggle.com/datasets/danofer/law-school-admissions-bar-passage)* dataset is used as a demo. Synthetic data will be generated for the following variables:',
             'post.description':
-                'The CART method is used to generate the synthetic data. CART generally produces high quality synthetic data, but might not work well on datasets with categorical variables with 20+ categories. Use Gaussian Copula in those cases.\n  \n*The original paper can be found [here](https://files.eric.ed.gov/fulltext/ED469370.pdf)',
+                '<br> The CART method is used to generate the synthetic data. CART generally produces high quality synthetic data, but might not work well on datasets with categorical variables with 20+ categories. Use Gaussian Copula in those cases.\n  \n*The original paper can be found [here](https://files.eric.ed.gov/fulltext/ED469370.pdf)',
             'data.column.Variable_name': 'Variable name',
             'data.sex': 'sex',
             'data.race1': 'race1',
@@ -131,33 +131,33 @@ export const en = {
                 columnsCountError: 'File may contain a maximum of 8 columns.',
             },
             fieldset: {
-                sourceDataset: 'Source data',
+                sourceDataset: 'Input',
                 sdgMethod: {
                     title: 'Method',
                     cart: 'CART',
                     gc: 'Gaussian Copula',
                     tooltip:
-                        'By default, the CART method is used to generate synthetic data. CART generally produces higher quality synthetic data, but might not work well on datasets with categorical variables with 20+ categories. Use Gaussian Copula in those cases.',
+                        'By default, the CART method is used to generate synthetic data. CART generally produces higher quality synthetic data, but might not work well on datasets with categorical variables with 20+ categories. Use Gaussian Copula in those cases',
                 },
                 nanTreatment: {
                     title: 'NaN values treatment',
                     drop: 'Drop rows with NaN values',
                     impute: 'Impute NaN values',
                     tooltip:
-                        'When using Gaussian Copula, you can choose how to handle missing values (NaN values) in your dataset. Dropping rows with NaN values removes them completely, while imputation replaces them with mean values for numerical columns and mode values for categorical columns.',
+                        "When using Gaussian Copula, you can choose how to handle missing values (NaN values) in your dataset. 'Drop rows with NaN values' removes them completely, while 'Impute NaN values' replaces them with mean values for numerical columns and mode values for categorical columns",
                 },
                 samples: 'Number of synthetic datapoints',
             },
             actions: {
-                tryItOut: 'Try it out',
+                tryItOut: 'Demo dataset',
                 runGeneration: 'Run synthetic data generation',
                 analyzing: 'Analyzing...',
                 initializing: 'Initialising...',
             },
         },
         demoCard: {
             title: 'Try it out!',
-            description: 'No dataset at hand? Use our demo dataset.',
+            description: 'No dataset at hand? Use our demo dataset',
         },
         columnsInDatasetInfo:
             'If the detected data types are incorrect, please change this locally in the source dataset before attaching it to the web app.',
@@ -168,7 +168,7 @@ export const en = {
         correlationRealdata: 'Correlation matrix',
         correlationSyntheticData: 'Correlation matrix',
         dataSetPreview: {
-            heading: '1. Preview of data',
+            heading: '0. Preview of data',
         },
         columnsInDataset: '1. Data types detection',
         handlingMissingDataTitle: '2. Handling missing data',
@@ -221,25 +221,35 @@ For categorical (or boolean) columns the following metrics are computed:
         efficacyMetricsDescription: `Efficacy metrics comparing real and synthetic datasets for downstream predictive tasks. The idea is to train a predictive model on synthetic data and evaluate its performance on real data. The type of metrics computed depends on the task:
 
 For regression (when the target is numerical):
-- Mean Squared Error (MSE)
-- Mean Absolute Error (MAE)
-- R^2 Score
+- {tooltip:syntheticData.meanSquaredError}Mean Squared Error (MSE){/tooltip}
+- {tooltip:syntheticData.meanAbsoluteError}Mean Absolute Error (MAE){/tooltip}
+- {tooltip:syntheticData.R2}R² Score{/tooltip}
 
-For classification (when the target is categorical/boolean):
-- Accuracy Score
-- Weighted F1 Score`,
+For classification (when the target is categorical):
+- {tooltip:syntheticData.accuracyScore}Accuracy Score{/tooltip}
+- {tooltip:syntheticData.weightedF1Score}Weighted F1 Score{/tooltip}`,
         disclosureProtectionTitle: 'Privacy metrics',
-        disclosureProtectionDescription: `A class to compute the disclosure protection metric for synthetic data. This metric measures the proportion of synthetic records that are too similar (within a defined threshold) to real records, posing a disclosure risk.`,
-        outputDataTitle: '5. Generated synthetic data',
+        disclosureProtectionDescription: `The disclosure protection metric measures the proportion of synthetic data points that closely resemble real data points (within a predefined threshold), posing a risk of traceability to personal data. A low 'risk_rate' and a high 'disclosure_protection_rate' indicate effective protection against the unintentional exposure of personal data.`,
+        outputDataTitle: '5. Download synthetic data and evaluation report',
         moreInfoTitle: '6. More information',
+        meanSquaredError:
+                    'Average squared difference between predicted and actual values, quantifying the accuracy of a model’s predictions by penalizing larger errors more heavily',
+        meanAbsoluteError:
+                    'Average magnitude of the errors between predicted and actual values, providing a straightforward assessment of model accuracy without emphasizing large errors',
+        R2:
+                    'Quantifies how well a model’s predictions match the actual data by measuring the proportion of variance in the target variable explained by the model',
+        accuracyScore:
+                    'Measures the proportion of correctly predicted instances out of the total instances, providing an overall assessment of a model’s performance in classification tasks',
+        weightedF1Score:
+                    'Harmonic mean of precision and recall, calculated for each class and weighted by the class’s support (number of true instances), providing a balanced performance measure for imbalanced datasets',
         correlationDifference:
             'Correlation difference: {{correlationDifference}}',
         univariateText:
-            '{{samples}} synthetic data points are generated using CART. The figures below display the value frequency for each variable. The synthetic data is of high quality when the frequencies are approximately the same.',
+            '<br>{{samples}} synthetic data points are generated using CART. The figures below display the value frequency for each variable. The synthetic data is of high quality when the frequencies are approximately the same.',
         bivariateText:
             'The figures below display the differences in value frequency for a combination of variables. For comparing two categorical variables, bar charts are plotted. For comparing a numerical and a categorical variables, a so called [violin plot](https://en.wikipedia.org/wiki/Violin_plot) is shown. For comparing two numercial variables, a [LOESS plot](https://en.wikipedia.org/wiki/Local_regression) is created. For all plots holds: the synthetic data is of high quality when the shape of the distributions in the synthetic data equal the distributions in the real data.',
         moreInfo:
-            'Do you want to learn more about synthetic data?\n  \n  \n  \n- [python-synthpop on Github](https://github.com/NGO-Algorithm-Audit/python-synthpop)\n- [local-first web app on Github](https://github.com/NGO-Algorithm-Audit/local-first-web-tool/tree/main)\n- [Synthetic Data: what, why and how?](https://royalsociety.org/-/media/policy/projects/privacy-enhancing-technologies/Synthetic_Data_Survey-24.pdf)\n- [Knowledge Network Synthetic Data](https://online.rijksinnovatiecommunity.nl/groups/399-kennisnetwerk-synthetischedata/welcome) (for Dutch public organizations)\n- [Synthetic data portal of Dutch Executive Agency for Education](https://duo.nl/open_onderwijsdata/footer/synthetische-data.jsp) (DUO)\n- [CART: synthpop resources](https://synthpop.org.uk/resources.html)\n- [Gaussian Copula - Synthetic Data Vault](https://docs.sdv.dev/sdv)',
+            'Do you want to learn more about synthetic data?\n  \n  \n  \n- [python-synthpop on Github](https://github.com/NGO-Algorithm-Audit/python-synthpop)\n- [local-first web app on Github](https://github.com/NGO-Algorithm-Audit/local-first-web-tool/tree/main)\n- [Synthetic Data: what, why and how?](https://royalsociety.org/-/media/policy/projects/privacy-enhancing-technologies/Synthetic_Data_Survey-24.pdf)\n- [Knowledge Network Synthetic Data](https://online.rijksinnovatiecommunity.nl/groups/399-kennisnetwerk-synthetischedata/welcome) (Dutch public organizations)\n- [Synthetic data portal of Dutch Executive Agency for Education](https://duo.nl/open_onderwijsdata/footer/synthetische-data.jsp) (DUO)\n- [CART: synthpop resources](https://synthpop.org.uk/resources.html)\n- [Gaussian Copula - Synthetic Data Vault](https://docs.sdv.dev/sdv)',
         missingData: `For {tooltip:syntheticData.missingDataMARTooltip}Missing At Random (MAR){/tooltip} and {tooltip:syntheticData.missingDataMNARTooltip}Missing Not At Random (MNAR){/tooltip} data, 
 we recommend to impute the missing data. For {tooltip:syntheticData.missingDataMCARTooltip}Missing Completely At Random (MCAR){/tooltip}, we recommend to remove the missing data.`,
         missingDataMARTooltip: `**MAR (Missing At Random)**:
diff --git a/src/locales/nl.ts b/src/locales/nl.ts
@@ -94,7 +94,7 @@ export const nl = {
             description:
                 'Een subset van de [Law School Admission Bar](https://www.kaggle.com/datasets/danofer/law-school-admissions-bar-passage)* dataset wordt gebruikt als demo. Synthetische data worden gegenereerd voor de volgende variablen:\n  \n&nbsp;&nbsp;\n\n',
             'post.description':
-                'De CART-methode wordt gebruikt om synthetische gegevens te genereren.\n CART produceert doorgaan een goede kwaliteit synthetische data, maar werkt minder goed voor data met categorische data met meer dan 20 categorieën. Gebruik in dit geval Gaussian Copula. \n&nbsp;&nbsp;\n\n*Het oorspronkelijke artikel is [hier](https://files.eric.ed.gov/fulltext/ED469370.pdf) te vinden.',
+                '<br>De CART-methode wordt gebruikt om synthetische gegevens te genereren.\n CART produceert doorgaans een goede kwaliteit synthetische data, maar werkt minder goed voor data met categorische data met meer dan 20 categorieën. Gebruik in dit geval Gaussian Copula. \n&nbsp;&nbsp;\n\n*Het oorspronkelijke artikel is [hier](https://files.eric.ed.gov/fulltext/ED469370.pdf) te vinden.',
             'data.column.Variable_name': 'Variabele name',
             'data.sex': 'sex',
             'data.race1': 'race1',
@@ -131,7 +131,7 @@ export const nl = {
                 columnsCountError: 'File mag maximaal 8 kolommen bevatten.',
             },
             fieldset: {
-                sourceDataset: 'Brondata',
+                sourceDataset: 'Input',
                 sdgMethod: {
                     title: 'Methode',
                     cart: 'CART',
@@ -149,17 +149,17 @@ export const nl = {
                 samples: 'Aantal synthetische datapunten',
             },
             actions: {
-                tryItOut: 'Uitproberen',
+                tryItOut: 'Demo dataset',
                 runGeneration: 'Start synthetische data generatie',
                 analyzing: 'Analyseren...',
                 initializing: 'Initialiseren...',
             },
             univariateText:
-                '{{samples}} synthetic datapunten via de CART-methode gegeneerd. De grafieken tonen de frequentie waarmee een variabele een bepaalde waarde aanneemt. De synthetische data is van hoge kwaliteit als de frequenties ongeveer gelijke zijn.',
+                '<br> {{samples}} synthetische datapunten zijn via de CART-methode gegenereerd. De grafieken tonen de frequentie waarmee een variabele een bepaalde waarde aanneemt. De synthetische data is van hoge kwaliteit als de frequenties ongeveer gelijk zijn.',
         },
         demoCard: {
             title: 'Probeer het uit!',
-            description: 'Geen dataset bij de hand? Gebruik onze demodata.',
+            description: 'Geen dataset bij de hand? Gebruik onze demo dataset',
         },
         columnsInDatasetInfo:
             'Als de gedetecteerd data types niet correct zijn, pas dit dan lokaal aan in de dataset voordat u deze opnieuw aan de app koppelt.',
@@ -188,11 +188,22 @@ export const nl = {
         correlationMatrixTitle: 'Correlatie matrix',
         efficacyMetricsTitle: 'Doeltreffendheid metrieken',
         disclosureProtectionTitle: 'Privacy metrieken',
-        outputDataTitle: '5. Output data',
+        disclosureProtectionDescription: `De onthullings beschermings metriek meet het aandeel synthetische datapunten dat te veel lijkt op echte datapunten (binnen een vooraf gedefinieerde drempelwaarde), wat een risico op herleidbaarheid naar persoonsgegevens vormt. Een lage 'risk_rate' en hoge 'disclosure_protection_rate' duiden op een goede bescherming tegen het onbedoeld prijsgeven van persoonsgegevens.`,
+        outputDataTitle: '5. Download synthetische data en evaluatierapport',
         moreInfoTitle: '6. Meer informatie',
+        meanSquaredError:
+                    'Gemiddeld kwadraatverschil tussen voorspelde en werkelijke waarden, dat de nauwkeurigheid van de voorspellingen van een model kwantificeert door grotere fouten zwaarder te bestraffen',
+        meanAbsoluteError:
+                    'Gemiddelde grootte van de fouten tussen voorspelde en werkelijke waarden, die een eenvoudige beoordeling van de nauwkeurigheid van het model biedt zonder de nadruk te leggen op grote fouten',
+        R2:
+                    'Kwantificeert hoe goed de voorspellingen van een model overeenkomen met de werkelijke gegevens door het aandeel van de variantie in de doelvariabele te meten dat door het model wordt verklaard',
+        accuracyScore:
+                    'Meet het aandeel correct voorspelde gevallen ten opzichte van het totaal, en geeft zo een algemene beoordeling van de prestaties van het classificatiemodel',
+        weightedF1Score:
+                    'Het harmonisch gemiddelde van precisie en recall, berekend per klasse en gewogen naar het aantal echte gevallen per klasse, wat een metriek biedt voor datasets met ongelijke klassenverdeling',
         correlationDifference: 'Correlatie verschil: {{correlationDifference}}',
         moreInfo:
-            '&nbsp;&nbsp;\n  \n  \n  \nWil je meer weten over synthetische data?\n  \n  \n  \n- [python-synthpop op Github](https://github.com/NGO-Algorithm-Audit/python-synthpop)\n- [local-first web app op Github](https://github.com/NGO-Algorithm-Audit/local-first-web-tool/tree/main)\n- [Synthetische Data: wat, waarom en hoe?](https://royalsociety.org/-/media/policy/projects/privacy-enhancing-technologies/Synthetic_Data_Survey-24.pdf)\n- [Kennis Netwerk Synthetische Data](https://online.rijksinnovatiecommunity.nl/groups/399-kennisnetwerk-synthetischedata/welcome) (for Dutch public organizations)\n- [Synthetische data portaal van DUO](https://duo.nl/open_onderwijsdata/footer/synthetische-data.jsp)\n- [CART: synthpop resources](https://synthpop.org.uk/resources.html)\n- [Gaussian Copula - Synthetic Data Vault](https://docs.sdv.dev/sdv)',
+            '&nbsp;&nbsp;\n  \n  \n  \nWil je meer weten over synthetische data?\n  \n  \n  \n- [python-synthpop op Github](https://github.com/NGO-Algorithm-Audit/python-synthpop)\n- [local-first web app op Github](https://github.com/NGO-Algorithm-Audit/local-first-web-tool/tree/main)\n- [Synthetische Data: wat, waarom en hoe?](https://royalsociety.org/-/media/policy/projects/privacy-enhancing-technologies/Synthetic_Data_Survey-24.pdf)\n- [Kennis Netwerk Synthetische Data](https://online.rijksinnovatiecommunity.nl/groups/399-kennisnetwerk-synthetischedata/welcome) (Nederlandse organisaties)\n- [Synthetische data portaal van DUO](https://duo.nl/open_onderwijsdata/footer/synthetische-data.jsp)\n- [CART: synthpop resources](https://synthpop.org.uk/resources.html)\n- [Gaussian Copula - Synthetic Data Vault](https://docs.sdv.dev/sdv)',
     },
     biasAnalysis: {
         dataSetPreview: {