Skip to content

Commit a3be459

Browse files
authored
Merge pull request #105 from NGO-Algorithm-Audit/JFP_edits
Jfp edits
2 parents 74f71df + 5fef8c2 commit a3be459

File tree

9 files changed

+54
-32
lines changed

9 files changed

+54
-32
lines changed

notebooks/synthetic data generation tool/GC_drop_LawSchoolAdmissionBar.ipynb

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -553,9 +553,24 @@
553553
},
554554
{
555555
"cell_type": "code",
556-
"execution_count": 11,
556+
"execution_count": 14,
557557
"metadata": {},
558-
"outputs": [],
558+
"outputs": [
559+
{
560+
"ename": "ValueError",
561+
"evalue": "All objects passed were None",
562+
"output_type": "error",
563+
"traceback": [
564+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
565+
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
566+
"Cell \u001b[0;32mIn[14], line 22\u001b[0m\n\u001b[1;32m 20\u001b[0m encoders[col] \u001b[38;5;241m=\u001b[39m encoder\n\u001b[1;32m 21\u001b[0m data\u001b[38;5;241m.\u001b[39mdrop(columns\u001b[38;5;241m=\u001b[39m[col], inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m---> 22\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconcat\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreset_index\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdrop\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtransformed_data\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreset_index\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdrop\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minplace\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m dtype \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnumerical\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 25\u001b[0m scaler \u001b[38;5;241m=\u001b[39m StandardScaler(with_mean\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m, with_std\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m)\n",
567+
"File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/reshape/concat.py:382\u001b[0m, in \u001b[0;36mconcat\u001b[0;34m(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)\u001b[0m\n\u001b[1;32m 379\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m copy \u001b[38;5;129;01mand\u001b[39;00m using_copy_on_write():\n\u001b[1;32m 380\u001b[0m copy \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m--> 382\u001b[0m op \u001b[38;5;241m=\u001b[39m \u001b[43m_Concatenator\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 383\u001b[0m \u001b[43m \u001b[49m\u001b[43mobjs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maxis\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 385\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 386\u001b[0m \u001b[43m \u001b[49m\u001b[43mjoin\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkeys\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 388\u001b[0m \u001b[43m \u001b[49m\u001b[43mlevels\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlevels\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 389\u001b[0m \u001b[43m \u001b[49m\u001b[43mnames\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnames\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 390\u001b[0m \u001b[43m \u001b[49m\u001b[43mverify_integrity\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mverify_integrity\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 391\u001b[0m \u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 392\u001b[0m \u001b[43m 
\u001b[49m\u001b[43msort\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m op\u001b[38;5;241m.\u001b[39mget_result()\n",
568+
"File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/reshape/concat.py:445\u001b[0m, in \u001b[0;36m_Concatenator.__init__\u001b[0;34m(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)\u001b[0m\n\u001b[1;32m 442\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverify_integrity \u001b[38;5;241m=\u001b[39m verify_integrity\n\u001b[1;32m 443\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcopy \u001b[38;5;241m=\u001b[39m copy\n\u001b[0;32m--> 445\u001b[0m objs, keys \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_clean_keys_and_objs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobjs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkeys\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 447\u001b[0m \u001b[38;5;66;03m# figure out what our result ndim is going to be\u001b[39;00m\n\u001b[1;32m 448\u001b[0m ndims \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_ndims(objs)\n",
569+
"File \u001b[0;32m/opt/homebrew/lib/python3.11/site-packages/pandas/core/reshape/concat.py:541\u001b[0m, in \u001b[0;36m_Concatenator._clean_keys_and_objs\u001b[0;34m(self, objs, keys)\u001b[0m\n\u001b[1;32m 538\u001b[0m keys \u001b[38;5;241m=\u001b[39m Index(clean_keys, name\u001b[38;5;241m=\u001b[39mname, dtype\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mgetattr\u001b[39m(keys, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 540\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(objs_list) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m--> 541\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAll objects passed were None\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 543\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m objs_list, keys\n",
570+
"\u001b[0;31mValueError\u001b[0m: All objects passed were None"
571+
]
572+
}
573+
],
559574
"source": [
560575
"from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler\n",
561576
"\n",
@@ -578,7 +593,7 @@
578593
" transformed_data = _encode_categorical(data[col], encoder)\n",
579594
" encoders[col] = encoder\n",
580595
" data.drop(columns=[col], inplace=True)\n",
581-
" data = pd.concat([data, transformed_data], axis=1)\n",
596+
" data = pd.concat([data.reset_index(drop=True), transformed_data.reset_index(drop=True)], axis=1)\n",
582597
"\n",
583598
" elif dtype == \"numerical\":\n",
584599
" scaler = StandardScaler(with_mean= False, with_std= False)\n",
@@ -588,7 +603,7 @@
588603
},
589604
{
590605
"cell_type": "code",
591-
"execution_count": 12,
606+
"execution_count": 13,
592607
"metadata": {},
593608
"outputs": [
594609
{
@@ -616,7 +631,7 @@
616631
},
617632
{
618633
"cell_type": "code",
619-
"execution_count": 13,
634+
"execution_count": null,
620635
"metadata": {},
621636
"outputs": [
622637
{
@@ -637,7 +652,7 @@
637652
},
638653
{
639654
"cell_type": "code",
640-
"execution_count": 14,
655+
"execution_count": null,
641656
"metadata": {},
642657
"outputs": [
643658
{
@@ -663,7 +678,7 @@
663678
},
664679
{
665680
"cell_type": "code",
666-
"execution_count": 15,
681+
"execution_count": null,
667682
"metadata": {},
668683
"outputs": [
669684
{

src/assets/bias-detection-python-code.tsx

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,8 @@ def run():
332332
'iterations': localIterations,
333333
'minClusterSize': localClusterSize,
334334
'performanceMetric': bias_score,
335-
'dataType': dataTypeText
335+
'dataType': dataTypeText,
336+
'higherIsBetter': 'biasAnalysis.higherIsBetter' if higherIsBetter else 'biasAnalysis.lowerIsBetter'
336337
}
337338
}))
338339
setResult(json.dumps({
@@ -375,7 +376,12 @@ def run():
375376
'defaultIndex': 0,
376377
'labelKey': 'biasAnalysis.clusterinResults.label',
377378
'valueKey' : 'biasAnalysis.clusterinResults.valueText',
378-
'values': biasInClusters
379+
'values': biasInClusters,
380+
'params': {
381+
'numZeroes': int(numZeros),
382+
'totalRecords': int(totalRecords),
383+
'clusterCount': clusterCount
384+
}
379385
}))
380386
setResult(json.dumps({
381387
'type': 'text',
@@ -402,8 +408,6 @@ def run():
402408
403409
404410
decoded_X_test["cluster_label"] = y_test
405-
406-
# ----
407411
408412
if localDataType == 'numeric':
409413
test_df["cluster_label"] = y_test
@@ -443,10 +447,6 @@ def run():
443447
print("The most biased cluster has a significantly higher average bias metric than the rest of the dataset.")
444448
else:
445449
print("No significant difference in average bias metric between the most biased cluster and the rest of the dataset.")
446-
# setResult(json.dumps({
447-
# 'type': 'heading',
448-
# 'headingKey': 'biasAnalysis.nodifference.heading',
449-
# }))
450450
451451
setResult(json.dumps({
452452
'type': 'heading',
@@ -482,12 +482,7 @@ def run():
482482
charts = []
483483
484484
for i, var in enumerate(variables):
485-
486-
#setResult(json.dumps({
487-
# 'type': 'heading',
488-
# 'headingKey': 'biasAnalysis.distribution.heading',
489-
# 'params': {'variable': var}
490-
#}))
485+
491486
print(f"means: {var}")
492487
print(overall_means[var])
493488
print(means[var])

src/components/BiasSettings.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ export default function BiasSettings({
7676
resolver: zodResolver(FormSchema),
7777
defaultValues: {
7878
dataType: 'numeric',
79-
whichPerformanceMetricValueIsBetter: 'higher',
79+
whichPerformanceMetricValueIsBetter: 'lower',
8080
},
8181
});
8282
const [iter, setIter] = useState([10]);

src/components/componentMapper.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,7 @@ export default function ComponentMapper({
267267
defaultIndex: resultItem.defaultIndex,
268268
labelKey: resultItem.labelKey,
269269
valueKey: resultItem.valueKey,
270+
params: resultItem.params || {},
270271
}}
271272
/>
272273
</ErrorBoundary>

src/components/composed-components/ClusterCategoriesDistributionAccordeonContent.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ const ClusterCategoriesDistributionAccordeonContent = ({
8080
chart.selectFilterGroup ===
8181
defaultCategory) ||
8282
!chart.selectFilterGroup ? (
83-
<div className="hideonprint">
83+
<div className="hideonprint w-full">
8484
<h5
8585
key={index}
8686
className="text-gray-800 font-semibold"
@@ -107,7 +107,7 @@ const ClusterCategoriesDistributionAccordeonContent = ({
107107
/>
108108
</div>
109109
) : null}
110-
<div className="hidden showonprint">
110+
<div className="hidden showonprint overflow-x-hidden">
111111
<h5
112112
key={index}
113113
className="text-gray-800 font-semibold"

src/components/composed-components/ClusterNumericalVariableDistributionAccordeonContent.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ const ClusterNumericalVariableDistributionAccordeonContent = ({
7171
chart.selectFilterGroup ===
7272
defaultCategory) ||
7373
!chart.selectFilterGroup ? (
74-
<div className="hideonprint">
74+
<div className="hideonprint w-full">
7575
<h5
7676
key={chartIndex}
7777
className="text-gray-800 font-semibold"
@@ -91,7 +91,7 @@ const ClusterNumericalVariableDistributionAccordeonContent = ({
9191
/>
9292
</div>
9393
) : null}
94-
<div className="hidden showonprint">
94+
<div className="hidden showonprint overflow-x-hidden">
9595
<h5
9696
key={`SingleBarChart-print-${chartIndex}`}
9797
className="text-gray-800 font-semibold"

src/components/composed-components/TextValueSelect.tsx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ const TextValueSelect = ({
1111
defaultIndex: number;
1212
labelKey: string;
1313
valueKey: string;
14+
params: Record<string, string>;
1415
};
1516
}) => {
1617
const [selectedIndex, setSelectedIndex] = useState<number | null>();
@@ -39,6 +40,7 @@ const TextValueSelect = ({
3940
<div>
4041
<label className="text-sm font-semibold">
4142
{t(data.valueKey, {
43+
...data.params,
4244
index: selectedIndex ?? data.defaultIndex,
4345
value,
4446
})}

src/locales/en.ts

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ export const en = {
181181
gaussianCopulaModelTitle: '3. Method: Gaussian Copula model',
182182
cartModelDescription:
183183
'The CART (Classification and Regression Trees) method generates synthetic data by learning patterns from real data through a decision tree that splits data into homogeneous groups based on feature values. It predicts averages for numerical data and assigns the most common category for categorical data, using these predictions to create new synthetic points.\n \n {{samples}} synthetic data points are generated.',
184-
evaluationOfGeneratedDataTitle: '4. Evaluation of generated synthetic data',
184+
evaluationOfGeneratedDataTitle:
185+
'4. Evaluation of generated synthetic data',
185186
distributionsTitle: '4.1 Distributions',
186187
diagnosticsReportTitle: '4.2. Diagnostic report',
187188
diagnosticsTitle: 'Diagnostic Results',
@@ -274,11 +275,11 @@ missing data are imputed. For {tooltip:syntheticData.missingDataMCARTooltip}Miss
274275
- <i class="font-serif">H</i><sub>0</sub>: no difference in bias variable between the most deviating cluster and the rest of the dataset
275276
- <i class="font-serif">H</i><sub>1</sub>: difference in bias variable between the most deviating cluster and the rest of the dataset
276277
277-
A two-sided t-test is performed to accept or reject <i class="font-serif">H</i><sub>0</sub>:.
278+
A two-sided t-test is performed to accept or reject <i class="font-serif">H</i><sub>0</sub>.
278279
279280
{tooltip:biasAnalysis.p_valueTooltip}p_value{/tooltip} : {{p_val}}
280281
`,
281-
p_valueTooltip: `p_value tooltip`,
282+
p_valueTooltip: `The p-value is the probability of observing a difference at least as extreme as the one measured, assuming the null hypothesis (H<sub>0</sub>) is true. A commonly used threshold is p≤0.05: below this value, the observed result is deemed sufficiently unlikely under H<sub>0</sub> to reject H<sub>0</sub> in favor of the alternative hypothesis (H<sub>1</sub>).`,
282283
dataSetPreview: {
283284
heading: '1. Preview of data',
284285
},
@@ -303,6 +304,8 @@ A two-sided t-test is performed to accept or reject <i class="font-serif">H</i><
303304
In this example, we analyze which group is most adversely affected by the risk prediction algorithm. We do this by applying the clustering algorithm on the dataset previewed below. The column "is_recid" indicates whether a defendant reoffended or not (1: yes, 0: no). The "score_text" column indicates whether a defendant was predicted to reoffend (1: yes, 0: no). The column "false_positive" (FP) represents cases where a defendant was predicted to reoffended by the algorithm, but didn't do so (1: FP, 0: no FP). A preview of the data can be found below. The column "false_positive" is used as the bias variable.
304305
`,
305306
},
307+
higherIsBetter: 'Higher value of bias variable is better',
308+
lowerIsBetter: 'Lower value of bias variable is better',
306309
parameters: {
307310
heading: '2. Hyperparameters selected for clustering',
308311
iterations: 'Number of iterations: {{value}}',
@@ -313,6 +316,7 @@ In this example, we analyze which group is most adversely affected by the risk p
313316
- Minimal cluster size: {{minClusterSize}}
314317
- Bias variable: {{performanceMetric}}
315318
- Data type: {{dataType}}
319+
- Bias variable interpretation: $t({{higherIsBetter}})
316320
`,
317321
},
318322
distributionOfFeaturesAcrossClustersAccordeonTitle:
@@ -366,7 +370,8 @@ In this example, we analyze which group is most adversely affected by the risk p
366370
- Number of clusters detected: {{clusterCount}}
367371
`,
368372
label: 'Choose cluster to show number of datapoints for',
369-
valueText: 'Number of datapoints in cluster {{index}}: {{value}}',
373+
valueText:
374+
'Number of datapoints in cluster {{index}}: {{value}} / {{totalRecords}}',
370375
},
371376
higherAverage: `The most deviating cluster has statistically significant different bias variable than the rest of the dataset.`,
372377
noSignificance: `No statistically significant difference in bias variable between the most biased cluster and the rest of the dataset.`,

src/locales/nl.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,9 @@ Er wordt een tweezijdige t-toets uitgevoerd om <i class="font-serif">H</i><sub>0
303303
304304
{tooltip:biasAnalysis.p_valueTooltip}p-waarde{/tooltip} : {{p_val}}
305305
`,
306-
p_valueTooltip: `p-waarde tooltip`,
306+
p_valueTooltip: `De p-waarde is de kans om een verschil te observeren dat minstens zo extreem is als het gemeten verschil, aangenomen dat de nulhypothese (H<sub>0</sub>) waar is. Een veelgebruikte drempelwaarde is p≤0,05: onder deze waarde wordt het waargenomen resultaat als voldoende onwaarschijnlijk onder H<sub>0</sub> beschouwd om H<sub>0</sub> te verwerpen ten gunste van de alternatieve hypothese (H<sub>1</sub>).`,
307+
higherIsBetter: 'Hogere waarde van bias variabele is beter',
308+
lowerIsBetter: 'Lagere waarde van bias variabele is beter',
307309
parameters: {
308310
heading: '2. Geselecteerde hyperparameters',
309311
iterations: 'Aantal iteraties: {{value}}',
@@ -316,6 +318,7 @@ Er wordt een tweezijdige t-toets uitgevoerd om <i class="font-serif">H</i><sub>0
316318
- Minimale clustergrootte: {{minClusterSize}}
317319
- Bias variabele: {{performanceMetric}}
318320
- Gegevenstype: {{dataType}}
321+
- Interpretatie van bias variabele: $t({{higherIsBetter}})
319322
`,
320323
},
321324
distribution: {
@@ -365,7 +368,8 @@ Er wordt een tweezijdige t-toets uitgevoerd om <i class="font-serif">H</i><sub>0
365368
- Aantal gevonden clusters: {{clusterCount}}
366369
`,
367370
label: 'Kies cluster om het aantal datapunten voor weer te geven',
368-
valueText: 'Aantal datapunten in cluster {{index}}: {{value}}',
371+
valueText:
372+
'Aantal datapunten in cluster {{index}}: {{value}} / {{totalRecords}}',
369373
},
370374
higherAverage: `Het meest afwijkende cluster heeft statistisch significant andere bias variabele dan de rest van de dataset.`,
371375
noSignificance: `Het meest afwijkende cluster heeft statistisch significant geen andere bias variabele dan de rest van de dataset.`,

0 commit comments

Comments
 (0)