Skip to content

Commit 8df2dd4

Browse files
committed
python script gelijktrekken met notebook
1 parent 425e564 commit 8df2dd4

File tree

1 file changed

+59
-60
lines changed

1 file changed

+59
-60
lines changed

src/assets/bias-detection-python-code.tsx

Lines changed: 59 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ from js import higherIsBetter
3030
from js import isDemo
3131
from js import dataTypeText
3232
33-
def t_test_on_cluster(test_df, bias_score, cluster_label):
33+
def t_test_on_cluster(test_df, bias_variable, cluster_label):
3434
3535
# Prepare results dictionary
3636
t_test_results = {}
@@ -40,7 +40,7 @@ def t_test_on_cluster(test_df, bias_score, cluster_label):
4040
cluster_df = test_df[test_df["cluster_label"] == cluster_label]
4141
rest_df = test_df[test_df["cluster_label"] != cluster_label]
4242
43-
for var in test_df.drop(columns=[bias_score, "cluster_label"]).columns:
43+
for var in test_df.drop(columns=[bias_variable, "cluster_label"]).columns:
4444
# values in both partitions
4545
values_cluster = cluster_df[var]
4646
values_rest = rest_df[var]
@@ -92,7 +92,7 @@ def t_test_on_cluster(test_df, bias_score, cluster_label):
9292
9393
return comparisons
9494
95-
def chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label):
95+
def chi2_test_on_cluster(decoded_X_test, bias_variable, cluster_label):
9696
9797
comparisons = []
9898
# prepare results dictionary
@@ -108,7 +108,7 @@ def chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label):
108108
alpha = 0.05
109109
alpha_adj = alpha/(p-2)
110110
111-
for column in decoded_X_test.drop(columns=[bias_score, "cluster_label"]).columns:
111+
for column in decoded_X_test.drop(columns=[bias_variable, "cluster_label"]).columns:
112112
for value in list(decoded_X_test[column].unique()):
113113
114114
# create a 2x2 contingency table for this value: rows = [cluster_label, rest], columns = [value present, value absent]
@@ -197,11 +197,11 @@ def run():
197197
features = [col for col in df.columns if (col not in emptycols) and (col != targetColumn) and (not col.startswith('Unnamed'))]
198198
199199
if isDemo:
200-
bias_score = "false_positive"
200+
bias_variable = "false_positive"
201201
localDataType = "categorical"
202202
localIterations = iterations # 20
203203
204-
print (f"Using demo parameters: bias_score={bias_score}, targetColumn={targetColumn}, dataType={localDataType}, iterations={iterations}")
204+
print (f"Using demo parameters: bias_variable={bias_variable}, targetColumn={targetColumn}, dataType={localDataType}, iterations={iterations}")
205205
206206
# Select relevant columns
207207
columns_of_interest = ["age_cat", "sex", "race", "c_charge_degree", "is_recid", "score_text"]
@@ -215,15 +215,15 @@ def run():
215215
filtered_df["score_text"] = filtered_df["score_text"].map(lambda x: 1 if x == "High" else 0)
216216
filtered_df["is_recid"] = filtered_df["is_recid"].astype("category")
217217
218-
filtered_df[bias_score] = ((filtered_df["is_recid"] == 0) & (filtered_df["score_text"] == 1)).astype(int)
218+
filtered_df[bias_variable] = ((filtered_df["is_recid"] == 0) & (filtered_df["score_text"] == 1))
219219
220220
221221
else:
222222
filtered_df = df
223-
bias_score = targetColumn
223+
bias_variable = targetColumn
224224
localDataType = dataType
225225
localIterations = iterations
226-
print (f"Using parameters: bias_score={bias_score}, targetColumn={targetColumn}, dataType={localDataType}, iterations={localIterations}")
226+
print (f"Using parameters: bias_variable={bias_variable}, targetColumn={targetColumn}, dataType={localDataType}, iterations={localIterations}")
227227
228228
if (dataType == 'numeric'):
229229
# Convert all columns to numeric
@@ -235,33 +235,35 @@ def run():
235235
236236
if localDataType == 'categorical':
237237
encoder = OrdinalEncoder()
238-
filtered_df[filtered_df.columns] = encoder.fit_transform(filtered_df).astype("int64")
238+
columns_to_encode = [col for col in filtered_df.columns if col != bias_variable]
239+
filtered_df[columns_to_encode] = encoder.fit_transform(filtered_df[columns_to_encode])
240+
# filtered_df[filtered_df.columns] = encoder.fit_transform(filtered_df).astype("int64")
239241
240242
print("filtered_df.dtypes:")
241243
print(filtered_df.dtypes)
242244
243-
df_no_bias_score = filtered_df.drop(columns=[bias_score])
244-
if df_no_bias_score.dtypes.nunique() == 1:
245+
df_no_bias_variable = filtered_df.drop(columns=[bias_variable])
246+
if df_no_bias_variable.dtypes.nunique() == 1:
245247
print('consistent data')
246248
else:
247249
print('not all columns in the provided dataset have the same data type')
248250
249-
250-
# split the data into training and testing sets
251-
train_df, test_df = train_test_split(filtered_df, test_size=0.2, random_state=42)
252-
X_train = train_df.drop(columns=[bias_score])
253-
254-
scaleY = 1
251+
# Multiply the bias variable by -1 if "Lower value of bias score is better", and by 1 if "Higher value of bias score is better"
252+
interpretationScalar = 1
255253
if higherIsBetter == 0:
256-
scaleY = -1;
257-
254+
interpretationScalar = -1;
258255
256+
filtered_df[bias_variable] = filtered_df[bias_variable] * interpretationScalar
259257
260-
# bias metric is negated because HBAC implementation in the package assumes that higher bias metric is better
261-
y_train = train_df[bias_score] * scaleY
258+
# split the data into training and testing sets
259+
train_df, test_df = train_test_split(filtered_df, test_size=0.2, random_state=42)
260+
X_train = train_df.drop(columns=[bias_variable])
261+
y_train = train_df[bias_variable]
262262
263263
# remove the bias metric from the test set to prevent issues with decoding later
264-
X_test = test_df.drop(columns=[bias_score])
264+
X_test = test_df.drop(columns=[bias_variable])
265+
y_test = test_df[bias_variable]
266+
265267
266268
# display the shapes of the resulting datasets
267269
print(f"Training set shape: {train_df.shape}")
@@ -277,7 +279,7 @@ def run():
277279
278280
print(f"Using local iterations: {localIterations}")
279281
print(f"Using cluster size: {localClusterSize}")
280-
print(f"Using bias metric: {bias_score}")
282+
print(f"Using bias metric: {bias_variable}")
281283
282284
283285
if localDataType == 'numeric':
@@ -341,7 +343,7 @@ def run():
341343
'params': {
342344
'iterations': localIterations,
343345
'minClusterSize': localClusterSize,
344-
'performanceMetric': bias_score,
346+
'performanceMetric': bias_variable,
345347
'dataType': dataTypeText,
346348
'higherIsBetter': 'biasAnalysis.higherIsBetter' if higherIsBetter else 'biasAnalysis.lowerIsBetter'
347349
}
@@ -398,27 +400,31 @@ def run():
398400
'data': ''
399401
}))
400402
401-
y_test = hbac.predict(X_test.to_numpy())
403+
cluster_label_X_test = hbac.predict(X_test.to_numpy())
402404
403405
decoded_X_test = test_df.copy()
404406
405-
print("y_test:")
406-
print(y_test)
407+
print("cluster_label_X_test:")
408+
print(cluster_label_X_test)
407409
print("test_df:")
408410
print(test_df)
409411
410412
if localDataType == 'categorical':
411413
# decode X_test using the encoder
412-
decoded_X_test = encoder.inverse_transform(test_df)
414+
test_df_pred = test_df[columns_to_encode]
415+
decoded_X_test = encoder.inverse_transform(test_df_pred)
413416
414417
415418
# display the decoded DataFrame
416-
decoded_X_test = pd.DataFrame(decoded_X_test, columns=test_df.columns)
419+
decoded_X_test = pd.DataFrame(decoded_X_test, columns=test_df_pred.columns)
417420
print(decoded_X_test)
418421
419422
420-
decoded_X_test["cluster_label"] = y_test
421-
423+
# decoded_X_test["cluster_label"] = cluster_label_X_test
424+
decoded_X_test[bias_variable] = y_test.values
425+
decoded_X_test["cluster_label"] = cluster_label_X_test
426+
427+
422428
if localDataType == 'numeric':
423429
test_df["cluster_label"] = y_test
424430
most_biased_cluster_df = test_df[test_df["cluster_label"] == 0]
@@ -427,20 +433,20 @@ def run():
427433
most_biased_cluster_df = decoded_X_test[decoded_X_test["cluster_label"] == 0]
428434
rest_df = decoded_X_test[decoded_X_test["cluster_label"] != 0]
429435
430-
# Convert score_text to numeric
431-
bias_score_most_biased = pd.to_numeric(most_biased_cluster_df[bias_score])
432-
bias_score_rest = pd.to_numeric(rest_df[bias_score])
433436
434-
435-
# the most disadvantageous value of the bias variable is always its minimum value
436-
most_biased_cluster_label = most_biased_cluster_df[bias_score].min()
437437
438+
# the most disadvantageous value of the bias variable is always its minimum value
439+
most_biased_cluster_label = most_biased_cluster_df[bias_variable].min()
440+
441+
438442
# Perform Z-test for proportions
439-
most_biased_count = (most_biased_cluster_df[bias_score] == most_biased_cluster_label).sum()
443+
most_biased_count = (most_biased_cluster_df[bias_variable] == most_biased_cluster_label).sum()
440444
most_biased_total = len(most_biased_cluster_df)
441-
rest_count = (rest_df[bias_score] == most_biased_cluster_label).sum()
445+
rest_count = (rest_df[bias_variable] == most_biased_cluster_label).sum()
442446
rest_total = len(rest_df)
443447
448+
449+
444450
# Perform two-proportion z-test
445451
counts = np.array([most_biased_count, rest_count])
446452
nobs = np.array([most_biased_total, rest_total])
@@ -453,20 +459,13 @@ def run():
453459
print(f"Z-statistic: {z_stat:.4f}")
454460
print(f"P-value: {p_val:.4f}")
455461
456-
457-
458-
# Perform independent two-sample t-test (two-sided: average bias metric in most_biased_cluster_df ≠ average bias metric in rest_df)
459-
# t_stat, p_val = ttest_ind(bias_score_most_biased, bias_score_rest, alternative='two-sided')
460-
461-
# print(f"T-statistic: {t_stat}")
462-
# print(f"p-value: {p_val}")
463-
462+
464463
setResult(json.dumps({
465464
'type': 'text',
466465
'key': 'biasAnalysis.testingStatisticalSignificance',
467466
'params': {
468467
'p_val': "{:.3f}".format(p_val),
469-
'biasVariable': bias_score
468+
'biasVariable': bias_variable
470469
}
471470
}))
472471
@@ -475,7 +474,7 @@ def run():
475474
'titleKey': 'biasAnalysis.statisticDetailsTitle',
476475
'textKey': 'biasAnalysis.statisticDetailsContent',
477476
'params': {
478-
'mostBiasedClusterLabel':most_biased_cluster_label,
477+
'mostBiasedClusterLabel': int(most_biased_cluster_label),
479478
'mostBiasedCount': int(most_biased_count),
480479
'mostBiasedTotal': int(most_biased_total),
481480
'mostBiasedFactor': "{:.4f}".format(most_biased_count / most_biased_total),
@@ -491,7 +490,7 @@ def run():
491490
'type': 'text',
492491
'key': 'biasAnalysis.higherAverage' if p_val < 0.05 else 'biasAnalysis.noSignificance',
493492
'params': {
494-
'biasVariable': bias_score
493+
'biasVariable': bias_variable
495494
}
496495
}))
497496
@@ -506,13 +505,13 @@ def run():
506505
cluster_counts = decoded_X_test["cluster_label"].value_counts()
507506
print(f"cluster_counts: {cluster_counts}")
508507
509-
if p_val < 0.05:
510-
511-
setResult(json.dumps({
508+
setResult(json.dumps({
512509
'type': 'heading',
513510
'headingKey': 'biasAnalysis.distribution.mainHeading'
514511
}))
515-
512+
513+
if p_val < 0.05:
514+
516515
if localDataType == 'numeric':
517516
# Calculate mean per cluster for each variable
518517
means = test_df.groupby("cluster_label").mean()
@@ -523,7 +522,7 @@ def run():
523522
524523
dropdownCategories = []
525524
for i, column in enumerate(X_test.columns):
526-
if column != bias_score:
525+
if column != bias_variable:
527526
dropdownCategories.append(column)
528527
529528
# Plot bar charts for each variable, showing means for each cluster and overall mean as red line
@@ -564,8 +563,8 @@ def run():
564563
565564
else:
566565
# Create subplots for each column
567-
columns_to_analyze = [col for col in decoded_X_test.columns if col not in [bias_score, "cluster_label"]]
568-
566+
# columns_to_analyze = [col for col in decoded_X_test.columns if col not in [bias_variable, "cluster_label"]]
567+
columns_to_analyze = decoded_X_test.columns.drop(['cluster_label', bias_variable])
569568
570569
rows = (len(columns_to_analyze) + 2) // 3 # Calculate the number of rows needed
571570
print(f"rows: {rows}")
@@ -636,15 +635,15 @@ def run():
636635
if p_val < 0.05:
637636
if (localDataType == 'numeric'):
638637
639-
comparisons = t_test_on_cluster(test_df, bias_score, cluster_label=0)
638+
comparisons = t_test_on_cluster(test_df, bias_variable, cluster_label=0)
640639
641640
setResult(json.dumps({
642641
'type': 'accordion',
643642
'titleKey': 'biasAnalysis.biasedCluster.accordionTitle',
644643
'comparisons': comparisons
645644
}))
646645
else:
647-
comparisons = chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label=0)
646+
comparisons = chi2_test_on_cluster(decoded_X_test, bias_variable, cluster_label=0)
648647
649648
setResult(json.dumps({
650649
'type': 'accordion',

0 commit comments

Comments
 (0)