Skip to content

Commit 1d20a4d

Browse files
authored
Merge pull request #106 from NGO-Algorithm-Audit/feature/fixes-11-jun-ubdt-2
Feature/fixes 11 jun ubdt 2
2 parents a3be459 + 5e04b3d commit 1d20a4d

File tree

7 files changed

+105
-92
lines changed

7 files changed

+105
-92
lines changed

src/assets/bias-detection-python-code.tsx

Lines changed: 74 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -447,74 +447,69 @@ def run():
447447
print("The most biased cluster has a significantly higher average bias metric than the rest of the dataset.")
448448
else:
449449
print("No significant difference in average bias metric between the most biased cluster and the rest of the dataset.")
450-
451-
setResult(json.dumps({
452-
'type': 'heading',
453-
'headingKey': 'biasAnalysis.distribution.mainHeading'
454-
}))
455-
456-
450+
457451
# visualize the clusters
458452
459453
# Group by cluster_label and count the occurrences
460454
cluster_counts = decoded_X_test["cluster_label"].value_counts()
461455
print(f"cluster_counts: {cluster_counts}")
462456
457+
if p_val < 0.05:
463458
464-
if localDataType == 'numeric':
465-
# Calculate mean per cluster for each variable
466-
means = test_df.groupby("cluster_label").mean()
467-
468-
# Calculate overall mean for each variable (excluding cluster_label)
469-
variables = X_test.columns.tolist()
470-
overall_means = test_df[variables].mean()
471-
472-
dropdownCategories = []
473-
for i, column in enumerate(X_test.columns):
474-
if column != bias_score:
475-
dropdownCategories.append(column)
476-
477-
# Plot bar charts for each variable, showing means for each cluster and overall mean as red line
478-
n_vars = len(variables)
479-
n_cols = 2
480-
n_rows = int(np.ceil(n_vars / n_cols))
481-
482-
charts = []
459+
setResult(json.dumps({
460+
'type': 'heading',
461+
'headingKey': 'biasAnalysis.distribution.mainHeading'
462+
}))
463+
464+
if localDataType == 'numeric':
465+
# Calculate mean per cluster for each variable
466+
means = test_df.groupby("cluster_label").mean()
483467
484-
for i, var in enumerate(variables):
485-
486-
print(f"means: {var}")
487-
print(overall_means[var])
488-
print(means[var])
489-
print(f"========================")
490-
468+
# Calculate overall mean for each variable (excluding cluster_label)
469+
variables = X_test.columns.tolist()
470+
overall_means = test_df[variables].mean()
491471
492-
charts.append({
493-
'yAxisLabel': 'distribution.frequency',
494-
'type': 'clusterNumericalVariableDistribution',
495-
'headingKey': 'biasAnalysis.distribution.heading',
496-
'title': var,
497-
'meanValue': overall_means[var],
498-
'data': means[var].to_json(orient='records'),
499-
'params': {'variable': var},
500-
'selectFilterGroup' : var,
501-
'defaultFilter': X_test.columns[0]
502-
})
472+
dropdownCategories = []
473+
for i, column in enumerate(X_test.columns):
474+
if column != bias_score:
475+
dropdownCategories.append(column)
476+
477+
# Plot bar charts for each variable, showing means for each cluster and overall mean as red line
478+
n_vars = len(variables)
479+
n_cols = 2
480+
n_rows = int(np.ceil(n_vars / n_cols))
481+
482+
charts = []
483+
484+
for i, var in enumerate(variables):
485+
486+
print(f"means: {var}")
487+
print(overall_means[var])
488+
print(means[var])
489+
print(f"========================")
490+
503491
504-
setResult(json.dumps({
505-
'type': 'clusterNumericalVariableDistributionAccordeon',
506-
'clusterCount': clusterCount,
507-
'charts': charts,
508-
'values': dropdownCategories,
509-
'titleKey': "biasAnalysis.numericalVariableDistributionAcrossClustersAccordeonTitle",
510-
'defaultValue': X_test.columns[0]
511-
}))
512-
513-
if p_val < 0.05:
492+
charts.append({
493+
'yAxisLabel': 'distribution.frequency',
494+
'type': 'clusterNumericalVariableDistribution',
495+
'headingKey': 'biasAnalysis.distribution.heading',
496+
'title': var,
497+
'meanValue': overall_means[var],
498+
'data': means[var].to_json(orient='records'),
499+
'params': {'variable': var},
500+
'selectFilterGroup' : var,
501+
'defaultFilter': X_test.columns[0]
502+
})
514503
515-
if localDataType == 'numeric':
516-
# see above for the code
517-
print("Statistically significant differences in means found.")
504+
setResult(json.dumps({
505+
'type': 'clusterNumericalVariableDistributionAccordeon',
506+
'clusterCount': clusterCount,
507+
'charts': charts,
508+
'values': dropdownCategories,
509+
'titleKey': "biasAnalysis.numericalVariableDistributionAcrossClustersAccordeonTitle",
510+
'defaultValue': X_test.columns[0]
511+
}))
512+
518513
else:
519514
# Create subplots for each column
520515
columns_to_analyze = [col for col in decoded_X_test.columns if col not in [bias_score, "cluster_label"]]
@@ -577,43 +572,44 @@ def run():
577572
setOutputData("mostBiasedCluster", df_most_biased_cluster.to_json(orient='records'))
578573
setOutputData("otherClusters", df_other.to_json(orient='records'))
579574
580-
575+
581576
setResult(json.dumps({
582577
'type': 'heading',
583578
'headingKey': 'biasAnalysis.conclusion'
584579
}))
585580
586-
setResult(json.dumps({
587-
'type': 'text',
588-
'key': 'biasAnalysis.conclusionDescription'
589-
}))
581+
590582
591583
# Calculate the difference in percentage for each category value between cluster 0 and the entire dataset
592584
diff_percentages = {}
593585
594586
# Select only cluster 0
595587
cluster_0 = decoded_X_test[decoded_X_test["cluster_label"] == 0]
596588
597-
if (localDataType == 'numeric'):
598-
599-
comparisons = t_test_on_cluster(test_df, bias_score, cluster_label=0)
600-
589+
if p_val < 0.05:
601590
setResult(json.dumps({
602-
'type': 'accordion',
603-
'titleKey': 'biasAnalysis.biasedCluster.accordionTitle',
604-
'comparisons': comparisons
591+
'type': 'text',
592+
'key': 'biasAnalysis.conclusionDescription'
605593
}))
606-
else:
607-
comparisons = chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label=0)
608594
609-
setResult(json.dumps({
610-
'type': 'accordion',
611-
'titleKey': 'biasAnalysis.biasedCluster.accordionTitle',
612-
'comparisons': comparisons,
613-
'className': 'biasAnalysis-biasedClusterAccordion'
614-
}))
615-
595+
if (localDataType == 'numeric'):
596+
597+
comparisons = t_test_on_cluster(test_df, bias_score, cluster_label=0)
616598
599+
setResult(json.dumps({
600+
'type': 'accordion',
601+
'titleKey': 'biasAnalysis.biasedCluster.accordionTitle',
602+
'comparisons': comparisons
603+
}))
604+
else:
605+
comparisons = chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label=0)
606+
607+
setResult(json.dumps({
608+
'type': 'accordion',
609+
'titleKey': 'biasAnalysis.biasedCluster.accordionTitle',
610+
'comparisons': comparisons,
611+
'className': 'biasAnalysis-biasedClusterAccordion'
612+
}))
617613
618614
619615
setResult(json.dumps({

src/assets/synthetic-data.tsx

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,8 @@ def run():
168168
169169
setResult(json.dumps({
170170
'type': 'list',
171-
'list': dataInfo
171+
'list': dataInfo,
172+
'translateValue' : True,
172173
}))
173174
174175
setResult(json.dumps({
@@ -255,11 +256,7 @@ def run():
255256
{
256257
'reportType': 'heading',
257258
'headingKey': 'syntheticData.handlingMissingDataTitle'
258-
},
259-
{
260-
'reportType': 'text',
261-
'textKey': 'syntheticData.handlingMissingDataDescription'
262-
},
259+
},
263260
{
264261
'reportType': 'table',
265262
'titleKey': 'syntheticData.handlingMissingDataTableTitle',
@@ -278,7 +275,7 @@ def run():
278275
},
279276
{
280277
'reportType': 'text',
281-
'textKey': 'syntheticData.cartModelDescription' if sdgMethod == 'cart' else 'syntheticData.gaussianCopulaModelDescription',
278+
'textKey': 'syntheticData.cartModelDescription' if sdgMethod == 'cart' else 'syntheticData.gcModelDescription',
282279
'params': {
283280
'samples': samples,
284281
}

src/components/SyntheticDataSettings.tsx

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,9 +305,17 @@ export default function SyntheticDataSettings({
305305
</div>
306306

307307
<div className="grid gap-3">
308-
<Label htmlFor="samples">
308+
<Label
309+
htmlFor="samples"
310+
className="flex flex-row items-center gap-1"
311+
>
309312
{t('syntheticData.form.fieldset.samples')} (
310313
{outputSamples})
314+
<IconInfoTooltip
315+
tooltipText={t(
316+
'syntheticData.form.fieldset.outputSamplesTooltip'
317+
)}
318+
/>
311319
</Label>
312320
<Slider
313321
id="samples"

src/components/componentMapper.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ export default function ComponentMapper({
8282
) => (
8383
<li key={`list-${index}`}>
8484
<span className="font-bold">{`${listItem.key}`}</span>
85-
{`: ${listItem.value}`}
85+
{`: ${resultItem.translateValue ? t(listItem.value) : listItem.value}`}
8686
</li>
8787
)
8888
);

src/components/pyodide/use-python.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ export const usePython = <T, TExport>(emptyParams: T) => {
5252
const runPython = useCallback(
5353
(message: { type: 'start'; params: { parameters: T } }) => {
5454
setLoading(true);
55-
setLoadingMessage('running analysis');
55+
setLoadingMessage('runningAnalysis');
5656

5757
setClusterInfo(undefined);
5858
setResult([]);

src/locales/en.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ export const en = {
2020
loadingPackages:
2121
'Loading core packages. On average this takes 10-15 seconds.',
2222
installingPackages: 'Installing additional packages...',
23+
runningAnalysis: 'Running analysis...',
24+
categorical: 'Categorical',
25+
numerical: 'Numerical',
2326
biasSettings: {
2427
dataType: {
2528
numeric: 'Numeric',
@@ -73,9 +76,9 @@ export const en = {
7376
analysisError: 'Error while analysing',
7477
noData: 'No data loaded',
7578
numericDataRequired:
76-
'Selected column must contain numerical data for k-means clustering.',
79+
'Not all data have the same format, please change locally before attaching the data.',
7780
categoricalDataRequired:
78-
'Selected column must contain categorical data for k-modes clustering.',
81+
'Not all data have same format, please change locally before attaching the data.',
7982
},
8083
actions: {
8184
tryItOut: 'Demo dataset',
@@ -149,6 +152,8 @@ export const en = {
149152
"When using Gaussian Copula, you can choose how to handle missing values (NaN values) in your dataset. 'Drop rows with NaN values' removes them completely, while 'Imputate NaN values' replaces them with mean values for numerical columns and mode values for categorical columns",
150153
},
151154
samples: 'Number of synthetic datapoints',
155+
outputSamplesTooltip:
156+
'Number of synthetic data points to be generated by the tool. Due to computational constraints of browser-based synthetic data generation, the maximum is set to 5,000.',
152157
},
153158
actions: {
154159
tryItOut: 'Demo dataset',
@@ -181,6 +186,7 @@ export const en = {
181186
gaussianCopulaModelTitle: '3. Method: Gaussian Copula model',
182187
cartModelDescription:
183188
'The CART (Classification and Regression Trees) method generates synthetic data by learning patterns from real data through a decision tree that splits data into homogeneous groups based on feature values. It predicts averages for numerical data and assigns the most common category for categorical data, using these predictions to create new synthetic points.\n \n {{samples}} synthetic data points are generated.',
189+
gcModelDescription: `Gaussian Copula works in two main steps: 1. The real data is transformed into a uniform distribution. Correlations between variables are modeled using a multivariate normal distribution (the Gaussian copula); and 2. Synthetic data is created by sampling from this Gaussian copula and transforming the samples back to the original data distributions.\n \n {{samples}} synthetic data points are generated.`,
184190
evaluationOfGeneratedDataTitle:
185191
'4. Evaluation of generated synthetic data',
186192
distributionsTitle: '4.1 Distributions',

src/locales/nl.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ export const nl = {
1717
loadingPyodide: 'Python omgeving laden...',
1818
loadingPackages: 'Laden van packages. Dit duurt gemiddeld 10-15 seconden.',
1919
installingPackages: 'Aanvullende packages laden',
20+
runningAnalysis: 'Analyse uitvoeren...',
21+
categorical: 'Categorisch',
22+
numerical: 'Numeriek',
2023
biasSettings: {
2124
dataType: {
2225
numeric: 'Numeriek',
@@ -71,9 +74,9 @@ export const nl = {
7174
analysisError: 'Fout tijdens analyse',
7275
noData: 'Geen gegevens geladen',
7376
numericDataRequired:
74-
'Geselecteerde kolom moet numerieke data bevatten voor k-means clustering.',
77+
'Niet alle data hebben hetzelfde formaat, pas dit lokaal aan voordat je de data toevoegt.',
7578
categoricalDataRequired:
76-
'Geselecteerde kolom moet categorische data bevatten voor k-modes clustering.',
79+
'Niet alle data hebben hetzelfde formaat, pas dit lokaal aan voordat je de data toevoegt.',
7780
},
7881
actions: {
7982
tryItOut: 'Demo dataset',
@@ -149,6 +152,8 @@ export const nl = {
149152
'Bij gebruik van Gaussian Copula kunt u kiezen hoe u omgaat met ontbrekende waarden (NaN waarden) in uw dataset. Het verwijderen van rijen met NaN waarden verwijdert deze volledig, terwijl imputatie deze vervangt door gemiddelde waarden voor numerieke kolommen en modus waarden voor categorische kolommen',
150153
},
151154
samples: 'Aantal synthetische datapunten',
155+
outputSamplesTooltip:
156+
'Aantal synthetische datapunten die door de tool worden gegenereerd. Vanwege de rekencapaciteit van browser-gebaseerde datageneratie is het maximum ingesteld op 5.000.',
152157
},
153158
actions: {
154159
tryItOut: 'Demo dataset',
@@ -183,6 +188,7 @@ export const nl = {
183188
gaussianCopulaModelTitle: '3. Methode: Gaussian Copula model',
184189
cartModelDescription:
185190
'De CART-methode (Classification and Regression Trees) genereert synthetische data door patronen uit echte data te leren via een beslisboom die de data opdeelt in homogene groepen op basis van kenmerken. Voor numerieke data voorspelt de methode gemiddelden en voor categorische data wijst het de meest voorkomende categorie toe. Deze voorspellingen worden vervolgens gebruikt om synthetische datapunten te creëren.\n \n {{samples}} synthetische datapunten zijn gegenereerd.',
191+
gcModelDescription: `Gaussian Copula werkt in twee stappen: 1. De echte data worden getransformeerd naar een uniforme verdeling. Correlaties tussen variabelen worden gemodelleerd met een multivariate normale verdeling (de Gaussian copula); en 2. Synthetische data worden gegenereerd door te sampelen uit deze copula en de samples terug te transformeren naar de oorspronkelijke verdelingen.\n \n {{samples}} synthetische datapunten zijn gegenereerd.`,
186192
evaluationOfGeneratedDataTitle:
187193
'4. Evaluatie van gegenereerde synthetische data',
188194
distributionsTitle: '4.1 Distributies',

0 commit comments

Comments
 (0)