Skip to content

Commit dd58861

Browse files
authored
Merge pull request #87 from NGO-Algorithm-Audit/feature/ubd-last-changes-pre-demos
Feature/ubd last changes pre demos
2 parents 643c519 + 76e0356 commit dd58861

15 files changed

+1646
-374
lines changed

notebooks/synthetic data generation tool/GC_SocialDiagnosis2011.ipynb

Lines changed: 1070 additions & 56 deletions
Large diffs are not rendered by default.

src/assets/bias-detection-python-code.tsx

Lines changed: 92 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -165,58 +165,6 @@ def chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label):
165165
166166
return comparisons
167167
168-
def diffDataframe(df, features, type=None, cluster1=None, cluster2=None):
169-
'''
170-
Creates difference dataframe, for numerical and categorical
171-
data: Takes dataframe of two clusters of interest and
172-
computes difference in means. Default to analyze most deviating
173-
cluster vs rest of the dataset, except specified otherwise.
174-
'''
175-
# Cluster comparison (optional)
176-
if cluster1 != None and cluster2 != None:
177-
df1 = df[df['Cluster'] == cluster1]
178-
df2 = df[df['Cluster'] == cluster2]
179-
else:
180-
df1 = df[df['Cluster'] == 0]
181-
df2 = df[df['Cluster'] != 0]
182-
183-
n_df1 = df1.shape[0]
184-
n_df2 = df2.shape[0]
185-
186-
diff_dict = {}
187-
CI_dict = {}
188-
189-
for feat in features:
190-
sample1 = df1[feat]
191-
sample2 = df2[feat]
192-
193-
if type == 'Numerical':
194-
mean1 = np.mean(sample1)
195-
mean2 = np.mean(sample2)
196-
diff = mean1 - mean2
197-
diff_dict[feat] = diff
198-
else:
199-
freq1 = sample1.value_counts()
200-
freq2 = sample2.value_counts()
201-
diff = freq1 - freq2
202-
diff_dict[feat] = diff
203-
204-
if type == 'Numerical':
205-
pd.set_option('display.float_format', lambda x: '%.5f' % x)
206-
diff_df = pd.DataFrame.from_dict(diff_dict, orient='index', columns=['Difference'])
207-
else:
208-
diff_df = pd.DataFrame()
209-
pd.set_option('display.float_format', lambda x: '%.5f' % x)
210-
211-
for _, value in diff_dict.items():
212-
df_temp = pd.DataFrame(value)
213-
diff_df = pd.concat([diff_df,df_temp], axis=0,)
214-
215-
diff_df = diff_df.fillna(0)
216-
diff_df.columns = ['Difference']
217-
218-
return(diff_df)
219-
220168
def run():
221169
csv_data = StringIO(data)
222170
df = pd.read_csv(csv_data)
@@ -227,7 +175,7 @@ def run():
227175
if isDemo:
228176
bias_score = "false_positive"
229177
localDataType = "categorical"
230-
localIterations = 20
178+
localIterations = iterations # 20
231179
232180
print (f"Using demo parameters: bias_score={bias_score}, targetColumn={targetColumn}, dataType={localDataType}, iterations={iterations}")
233181
@@ -297,7 +245,7 @@ def run():
297245
print(f"X_train shape: {X_train.shape}")
298246
299247
if isDemo:
300-
localClusterSize = X_train.shape[0]*0.01
248+
localClusterSize = clusterSize # X_train.shape[0]*0.01
301249
else:
302250
localClusterSize = clusterSize
303251
@@ -317,7 +265,7 @@ def run():
317265
cluster_df = pd.DataFrame(hbac.scores_, columns=['Cluster scores'])
318266
319267
n_most_bias = np.sum(hbac.labels_ == 0) # number of datapoints in the most deviating cluster ... user should be able to select the cluster to analyze
320-
print(f"Number of datapoints in most deviating cluster: {n_most_bias}/{train_df.shape[0]}")
268+
# print(f"Number of datapoints in most deviating cluster: {n_most_bias}/{train_df.shape[0]}")
321269
print(f"Number of clusters: {hbac.n_clusters_}")
322270
print(f"Bias metric scores: {hbac.scores_}")
323271
@@ -328,7 +276,9 @@ def run():
328276
329277
# df['Cluster'] = hbac.labels_
330278
331-
279+
biasInClusters = []
280+
for i in range(clusterCount):
281+
biasInClusters.append( int(np.sum(hbac.labels_ == i)))
332282
333283
334284
if isDemo:
@@ -404,6 +354,15 @@ def run():
404354
'clusterCount': clusterCount
405355
}
406356
}))
357+
358+
setResult(json.dumps({
359+
'type': 'TextValueSelect',
360+
'key': 'biasAnalysis.clusterinResults.description',
361+
'defaultIndex': 0,
362+
'labelKey': 'biasAnalysis.clusterinResults.label',
363+
'valueKey' : 'biasAnalysis.clusterinResults.valueText',
364+
'values': biasInClusters
365+
}))
407366
setResult(json.dumps({
408367
'type': 'text',
409368
'data': ''
@@ -457,19 +416,23 @@ def run():
457416
'type': 'text',
458417
'key': 'biasAnalysis.testingStatisticalSignificance',
459418
'params': {
460-
't_stat': t_stat,
461-
'p_val': p_val
419+
'p_val': "{:.3f}".format(p_val)
462420
}
463421
}))
464422
423+
setResult(json.dumps({
424+
'type': 'text',
425+
'key': 'biasAnalysis.higherAverage' if p_val < 0.05 else 'biasAnalysis.noSignificance'
426+
}))
427+
465428
if p_val < 0.05:
466429
print("The most biased cluster has a significantly higher average bias metric than the rest of the dataset.")
467430
else:
468431
print("No significant difference in average bias metric between the most biased cluster and the rest of the dataset.")
469-
setResult(json.dumps({
470-
'type': 'heading',
471-
'headingKey': 'biasAnalysis.nodifference.heading',
472-
}))
432+
# setResult(json.dumps({
433+
# 'type': 'heading',
434+
# 'headingKey': 'biasAnalysis.nodifference.heading',
435+
# }))
473436
474437
setResult(json.dumps({
475438
'type': 'heading',
@@ -492,60 +455,56 @@ def run():
492455
variables = X_test.columns.tolist()
493456
overall_means = test_df[variables].mean()
494457
458+
dropdownCategories = []
459+
for i, column in enumerate(X_test.columns):
460+
dropdownCategories.append(column)
461+
495462
# Plot bar charts for each variable, showing means for each cluster and overall mean as red line
496463
n_vars = len(variables)
497464
n_cols = 2
498465
n_rows = int(np.ceil(n_vars / n_cols))
499466
467+
charts = []
468+
500469
for i, var in enumerate(variables):
501470
502-
setResult(json.dumps({
503-
'type': 'heading',
504-
'headingKey': 'biasAnalysis.distribution.heading',
505-
'params': {'variable': var}
506-
}))
471+
#setResult(json.dumps({
472+
# 'type': 'heading',
473+
# 'headingKey': 'biasAnalysis.distribution.heading',
474+
# 'params': {'variable': var}
475+
#}))
507476
print(f"means: {var}")
508477
print(overall_means[var])
509478
print(means[var])
510479
print(f"========================")
511-
setResult(json.dumps({
512-
'type': 'barchart',
513-
'title': var,
514-
'meanValue': overall_means[var],
515-
'data': means[var].to_json(orient='records')
516-
}))
517-
518-
if p_val < 0.05:
519-
520-
if localDataType == 'numeric':
521-
# Calculate mean per cluster for each variable
522-
means = test_df.groupby("cluster_label").mean()
523-
524-
# Calculate overall mean for each variable (excluding cluster_label)
525-
variables = X_test.columns.tolist()
526-
overall_means = test_df[variables].mean()
527-
528-
# Plot bar charts for each variable, showing means for each cluster and overall mean as red line
529-
n_vars = len(variables)
530-
n_cols = 2
531-
n_rows = int(np.ceil(n_vars / n_cols))
480+
532481
533-
for i, var in enumerate(variables):
534-
535-
setResult(json.dumps({
536-
'type': 'heading',
537-
'headingKey': 'biasAnalysis.distribution.heading',
538-
'params': {'variable': var}
539-
}))
540-
541-
setResult(json.dumps({
542-
'type': 'barchart',
482+
charts.append({
483+
'yAxisLabel': 'distribution.frequency',
484+
'type': 'clusterNumericalVariableDistribution',
485+
'headingKey': 'biasAnalysis.distribution.heading',
543486
'title': var,
487+
'meanValue': overall_means[var],
544488
'data': means[var].to_json(orient='records'),
545-
'meanValue': overall_means[var]
546-
}))
489+
'params': {'variable': var},
490+
'selectFilterGroup' : var,
491+
'defaultFilter': X_test.columns[0]
492+
})
547493
494+
setResult(json.dumps({
495+
'type': 'clusterNumericalVariableDistributionAccordeon',
496+
'clusterCount': clusterCount,
497+
'charts': charts,
498+
'values': dropdownCategories,
499+
'titleKey': "biasAnalysis.numericalVariableDistributionAcrossClustersAccordeonTitle",
500+
'defaultValue': X_test.columns[0]
501+
}))
502+
503+
if p_val < 0.05:
548504
505+
if localDataType == 'numeric':
506+
# see above for the code
507+
print("Statistically significant differences in means found.")
549508
else:
550509
# Create subplots for each column
551510
columns_to_analyze = decoded_X_test.columns[:-1] # Exclude 'cluster_label' column
@@ -556,11 +515,7 @@ def run():
556515
for i, column in enumerate(columns_to_analyze):
557516
dropdownCategories.append(column)
558517
559-
setResult(json.dumps({
560-
'type': 'clusterCategorieSelect',
561-
'values': dropdownCategories,
562-
'defaultValue': columns_to_analyze[0]
563-
}))
518+
charts = []
564519
565520
for i, column in enumerate(columns_to_analyze):
566521
print(f"Analyzing column: {column}")
@@ -569,22 +524,39 @@ def run():
569524
percentages = grouped_data.div(grouped_data.sum(axis=1), axis=0) * 100
570525
571526
category_values = grouped_data.columns.tolist()
572-
573527
574-
setResult(json.dumps({
528+
means = []
529+
overall_counts = decoded_X_test[column].value_counts(normalize=True) * 100
530+
for cat_value, avg_pct in overall_counts.items():
531+
means.append({
532+
'category': cat_value,
533+
'mean': avg_pct
534+
})
535+
536+
537+
charts.append({
575538
'type': 'clusterCategorieDistribution',
576539
'headingKey': 'biasAnalysis.distribution.heading',
577540
'title': column,
578541
'categories': category_values,
579542
'data': percentages.T.to_json(orient='records'),
580543
'selectFilterGroup' : column,
581544
'params': {'variable': column},
582-
'defaultFilter': columns_to_analyze[0]
583-
}))
545+
'defaultFilter': columns_to_analyze[0],
546+
'means': means,
547+
'isViridis': True,
548+
'yAxisLabel': 'distribution.frequency'
549+
})
550+
551+
584552
585553
setResult(json.dumps({
586-
'type': 'cluster_legend',
554+
'type': 'clusterCategorieDistributionAccordeon',
587555
'clusterCount': clusterCount,
556+
'charts': charts,
557+
'values': dropdownCategories,
558+
'titleKey': "biasAnalysis.distributionOfFeaturesAcrossClustersAccordeonTitle",
559+
'defaultValue': columns_to_analyze[0]
588560
}))
589561
590562
df_most_biased_cluster = most_biased_cluster_df
@@ -623,6 +595,16 @@ def run():
623595
'headingKey': 'biasAnalysis.conclusion'
624596
}))
625597
598+
setResult(json.dumps({
599+
'type': 'text',
600+
'key': 'biasAnalysis.conclusionDescription'
601+
}))
602+
603+
setResult(json.dumps({
604+
'type': 'export-button',
605+
}))
606+
607+
626608
setResult(json.dumps({
627609
'type': 'heading',
628610
'headingKey': 'biasAnalysis.moreInformationHeading'

src/components/BiasSettings.tsx

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import {
88
} from '@/components/ui/select';
99
import { Slider } from '@/components/ui/slider';
1010
import { RadioGroup, RadioGroupItem } from '@/components/ui/radio-group';
11-
import CSVReader, { csvReader } from './CSVReader';
11+
import CSVReader from './CSVReader';
1212
import { useEffect, useState } from 'react';
1313
import { Button } from './ui/button';
1414
import { ArrowDown, ArrowRight, InfoIcon } from 'lucide-react';
@@ -58,7 +58,14 @@ export default function BiasSettings({
5858
isErrorDuringAnalysis,
5959
}: {
6060
onRun: (params: BiasDetectionParameters) => void;
61-
onDataLoad: csvReader['onChange'];
61+
onDataLoad: (
62+
data: Record<string, string>[],
63+
stringified: string,
64+
fileName: string,
65+
demo?: boolean,
66+
columnsCount?: number,
67+
params?: BiasDetectionParameters
68+
) => void;
6269
isLoading: boolean;
6370
isErrorDuringAnalysis: boolean;
6471
isInitialised: boolean;
@@ -144,7 +151,18 @@ export default function BiasSettings({
144151
file.data as Record<string, string>[],
145152
Papa.unparse(file.data),
146153
'demo',
147-
true
154+
true,
155+
undefined,
156+
{
157+
clusterSize: clusters[0],
158+
iterations: iter[0],
159+
targetColumn: '',
160+
dataType: '',
161+
higherIsBetter:
162+
form.getValues().whichPerformanceMetricValueIsBetter ===
163+
'higher',
164+
isDemo: true,
165+
}
148166
);
149167
};
150168

src/components/MarkdownWithTooltips.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ export function MarkdownWithTooltips({
2929
<Markdown
3030
className={className}
3131
remarkPlugins={[remarkInfoTooltip, remarkGfm]}
32-
rehypePlugins={[rehypeInfoTooltip, rehypeRaw]}
32+
rehypePlugins={[rehypeRaw, rehypeInfoTooltip]}
3333
components={{
3434
// @ts-expect-error - math is a custom components
3535
TooltipWrapper,

src/components/SyntheticDataSettings.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ export default function SyntheticDataSettings({
9595
}, [data]);
9696

9797
const onDemoRun = async () => {
98-
const file = await fetch('/LawSchoolAdmissionBar_v1.csv')
98+
const file = await fetch('/LawSchoolAdmissionBar_small.csv')
9999
.then(response => response.text())
100100
.then(data => Papa.parse(data, { header: true }));
101101

0 commit comments

Comments (0)