@@ -30,7 +30,7 @@ from js import higherIsBetter
 from js import isDemo
 from js import dataTypeText
 
-def t_test_on_cluster(test_df, bias_score, cluster_label):
+def t_test_on_cluster(test_df, bias_variable, cluster_label):
 
     # Prepare results dictionary
     t_test_results = {}
@@ -40,7 +40,7 @@ def t_test_on_cluster(test_df, bias_score, cluster_label):
     cluster_df = test_df[test_df["cluster_label"] == cluster_label]
     rest_df = test_df[test_df["cluster_label"] != cluster_label]
 
-    for var in test_df.drop(columns=[bias_score, "cluster_label"]).columns:
+    for var in test_df.drop(columns=[bias_variable, "cluster_label"]).columns:
         # values in both partitions
         values_cluster = cluster_df[var]
         values_rest = rest_df[var]
@@ -92,7 +92,7 @@ def t_test_on_cluster(test_df, bias_score, cluster_label):
 
     return comparisons
 
-def chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label):
+def chi2_test_on_cluster(decoded_X_test, bias_variable, cluster_label):
 
     comparisons = []
     # prepare results dictionary
@@ -108,7 +108,7 @@ def chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label):
     alpha = 0.05
     alpha_adj = alpha/(p-2)
 
-    for column in decoded_X_test.drop(columns=[bias_score, "cluster_label"]).columns:
+    for column in decoded_X_test.drop(columns=[bias_variable, "cluster_label"]).columns:
         for value in list(decoded_X_test[column].unique()):
 
             # create a 2x2 contingency table for this value: rows = [cluster_label, rest], columns = [value present, value absent]
@@ -197,11 +197,11 @@ def run():
     features = [col for col in df.columns if (col not in emptycols) and (col != targetColumn) and (not col.startswith('Unnamed'))]
 
     if isDemo:
-        bias_score = "false_positive"
+        bias_variable = "false_positive"
         localDataType = "categorical"
         localIterations = iterations # 20
 
-        print (f"Using demo parameters: bias_score={bias_score}, targetColumn={targetColumn}, dataType={localDataType}, iterations={iterations}")
+        print (f"Using demo parameters: bias_variable={bias_variable}, targetColumn={targetColumn}, dataType={localDataType}, iterations={iterations}")
 
         # Select relevant columns
         columns_of_interest = ["age_cat", "sex", "race", "c_charge_degree", "is_recid", "score_text"]
@@ -215,15 +215,15 @@ def run():
         filtered_df["score_text"] = filtered_df["score_text"].map(lambda x: 1 if x == "High" else 0)
         filtered_df["is_recid"] = filtered_df["is_recid"].astype("category")
 
-        filtered_df[bias_score] = ((filtered_df["is_recid"] == 0) & (filtered_df["score_text"] == 1)).astype(int)
+        filtered_df[bias_variable] = ((filtered_df["is_recid"] == 0) & (filtered_df["score_text"] == 1))
 
 
     else:
         filtered_df = df
-        bias_score = targetColumn
+        bias_variable = targetColumn
         localDataType = dataType
         localIterations = iterations
-        print (f"Using parameters: bias_score={bias_score}, targetColumn={targetColumn}, dataType={localDataType}, iterations={localIterations}")
+        print (f"Using parameters: bias_variable={bias_variable}, targetColumn={targetColumn}, dataType={localDataType}, iterations={localIterations}")
 
         if (dataType == 'numeric'):
             # Convert all columns to numeric
@@ -235,33 +235,35 @@ def run():
 
     if localDataType == 'categorical':
         encoder = OrdinalEncoder()
-        filtered_df[filtered_df.columns] = encoder.fit_transform(filtered_df).astype("int64")
+        columns_to_encode = [col for col in filtered_df.columns if col != bias_variable]
+        filtered_df[columns_to_encode] = encoder.fit_transform(filtered_df[columns_to_encode])
+        # filtered_df[filtered_df.columns] = encoder.fit_transform(filtered_df).astype("int64")
 
     print("filtered_df.dtypes:")
     print(filtered_df.dtypes)
 
-    df_no_bias_score = filtered_df.drop(columns=[bias_score])
-    if df_no_bias_score.dtypes.nunique() == 1:
+    df_no_bias_variable = filtered_df.drop(columns=[bias_variable])
+    if df_no_bias_variable.dtypes.nunique() == 1:
         print('consistent data')
     else:
         print('not all columns in the provided dataset have the same data type')
 
-
-    # split the data into training and testing sets
-    train_df, test_df = train_test_split(filtered_df, test_size=0.2, random_state=42)
-    X_train = train_df.drop(columns=[bias_score])
-
-    scaleY = 1
+    # Multiply the bias variable by -1 if "Lower value of bias score is better", by 1 if "Higher value of bias score is better"
+    interpretationScalar = 1
     if higherIsBetter == 0:
-        scaleY = -1;
-
+        interpretationScalar = -1;
 
+    filtered_df[bias_variable] = filtered_df[bias_variable] * interpretationScalar
 
-    # bias metric is negated because HBAC implementation in the package assumes that higher bias metric is better
-    y_train = train_df[bias_score] * scaleY
+    # split the data into training and testing sets
+    train_df, test_df = train_test_split(filtered_df, test_size=0.2, random_state=42)
+    X_train = train_df.drop(columns=[bias_variable])
+    y_train = train_df[bias_variable]
 
     # remove the bias metric from the test set to prevent issues with decoding later
-    X_test = test_df.drop(columns=[bias_score])
+    X_test = test_df.drop(columns=[bias_variable])
+    y_test = test_df[bias_variable]
+
 
     # display the shapes of the resulting datasets
     print(f"Training set shape: {train_df.shape}")
@@ -277,7 +279,7 @@ def run():
 
     print(f"Using local iterations: {localIterations}")
     print(f"Using cluster size: {localClusterSize}")
-    print(f"Using bias metric: {bias_score}")
+    print(f"Using bias metric: {bias_variable}")
 
 
     if localDataType == 'numeric':
@@ -341,7 +343,7 @@ def run():
         'params': {
             'iterations': localIterations,
             'minClusterSize': localClusterSize,
-            'performanceMetric': bias_score,
+            'performanceMetric': bias_variable,
             'dataType': dataTypeText,
             'higherIsBetter': 'biasAnalysis.higherIsBetter' if higherIsBetter else 'biasAnalysis.lowerIsBetter'
         }
@@ -398,27 +400,31 @@ def run():
         'data': ''
     }))
 
-    y_test = hbac.predict(X_test.to_numpy())
+    cluster_label_X_test = hbac.predict(X_test.to_numpy())
 
     decoded_X_test = test_df.copy()
 
-    print("y_test:")
-    print(y_test)
+    print("cluster_label_X_test:")
+    print(cluster_label_X_test)
     print("test_df:")
     print(test_df)
 
     if localDataType == 'categorical':
         # decode X_test using the encoder
-        decoded_X_test = encoder.inverse_transform(test_df)
+        test_df_pred = test_df[columns_to_encode]
+        decoded_X_test = encoder.inverse_transform(test_df_pred)
 
 
         # display the decoded DataFrame
-        decoded_X_test = pd.DataFrame(decoded_X_test, columns=test_df.columns)
+        decoded_X_test = pd.DataFrame(decoded_X_test, columns=test_df_pred.columns)
         print(decoded_X_test)
 
 
-    decoded_X_test["cluster_label"] = y_test
-
+    # decoded_X_test["cluster_label"] = cluster_label_X_test
+    decoded_X_test[bias_variable] = y_test.values
+    decoded_X_test["cluster_label"] = cluster_label_X_test
+
+
     if localDataType == 'numeric':
         test_df["cluster_label"] = y_test
         most_biased_cluster_df = test_df[test_df["cluster_label"] == 0]
@@ -427,20 +433,20 @@ def run():
         most_biased_cluster_df = decoded_X_test[decoded_X_test["cluster_label"] == 0]
         rest_df = decoded_X_test[decoded_X_test["cluster_label"] != 0]
 
-    # Convert score_text to numeric
-    bias_score_most_biased = pd.to_numeric(most_biased_cluster_df[bias_score])
-    bias_score_rest = pd.to_numeric(rest_df[bias_score])
 
-
-    # most disavanteagous bias variable is always minimum value of the bias variable
-    most_biased_cluster_label = most_biased_cluster_df[bias_score].min()
 
+    # the most disadvantageous value of the bias variable is always its minimum value
+    most_biased_cluster_label = most_biased_cluster_df[bias_variable].min()
+
+
     # Perform Z-test for proportions
-    most_biased_count = (most_biased_cluster_df[bias_score] == most_biased_cluster_label).sum()
+    most_biased_count = (most_biased_cluster_df[bias_variable] == most_biased_cluster_label).sum()
     most_biased_total = len(most_biased_cluster_df)
-    rest_count = (rest_df[bias_score] == most_biased_cluster_label).sum()
+    rest_count = (rest_df[bias_variable] == most_biased_cluster_label).sum()
     rest_total = len(rest_df)
 
+
+
     # Perform two-proportion z-test
     counts = np.array([most_biased_count, rest_count])
     nobs = np.array([most_biased_total, rest_total])
@@ -453,20 +459,13 @@ def run():
     print(f"Z-statistic: {z_stat:.4f}")
     print(f"P-value: {p_val:.4f}")
 
-
-
-    # Perform independent two-sample t-test (two-sided: average bias metric in most_biased_cluster_df ≠ average bias metric in rest_df)
-    # t_stat, p_val = ttest_ind(bias_score_most_biased, bias_score_rest, alternative='two-sided')
-
-    # print(f"T-statistic: {t_stat}")
-    # print(f"p-value: {p_val}")
-
+
     setResult(json.dumps({
         'type': 'text',
         'key': 'biasAnalysis.testingStatisticalSignificance',
         'params': {
             'p_val': "{:.3f}".format(p_val),
-            'biasVariable': bias_score
+            'biasVariable': bias_variable
         }
     }))
 
@@ -475,7 +474,7 @@ def run():
         'titleKey': 'biasAnalysis.statisticDetailsTitle',
         'textKey': 'biasAnalysis.statisticDetailsContent',
         'params': {
-            'mostBiasedClusterLabel':most_biased_cluster_label,
+            'mostBiasedClusterLabel': int(most_biased_cluster_label),
             'mostBiasedCount': int(most_biased_count),
             'mostBiasedTotal': int(most_biased_total),
             'mostBiasedFactor': "{:.4f}".format(most_biased_count / most_biased_total),
@@ -491,7 +490,7 @@ def run():
         'type': 'text',
         'key': 'biasAnalysis.higherAverage' if p_val < 0.05 else 'biasAnalysis.noSignificance',
         'params': {
-            'biasVariable': bias_score
+            'biasVariable': bias_variable
         }
     }))
 
@@ -506,13 +505,13 @@ def run():
     cluster_counts = decoded_X_test["cluster_label"].value_counts()
     print(f"cluster_counts: {cluster_counts}")
 
-    if p_val < 0.05:
-
-        setResult(json.dumps({
+    setResult(json.dumps({
         'type': 'heading',
         'headingKey': 'biasAnalysis.distribution.mainHeading'
     }))
-
+
+    if p_val < 0.05:
+
         if localDataType == 'numeric':
             # Calculate mean per cluster for each variable
             means = test_df.groupby("cluster_label").mean()
@@ -523,7 +522,7 @@ def run():
 
             dropdownCategories = []
             for i, column in enumerate(X_test.columns):
-                if column != bias_score:
+                if column != bias_variable:
                     dropdownCategories.append(column)
 
             # Plot bar charts for each variable, showing means for each cluster and overall mean as red line
@@ -564,8 +563,8 @@ def run():
 
         else:
             # Create subplots for each column
-            columns_to_analyze = [col for col in decoded_X_test.columns if col not in [bias_score, "cluster_label"]]
-
+            # columns_to_analyze = [col for col in decoded_X_test.columns if col not in [bias_variable, "cluster_label"]]
+            columns_to_analyze = decoded_X_test.columns.drop(['cluster_label', bias_variable])
 
             rows = (len(columns_to_analyze) + 2) // 3 # Calculate the number of rows needed
             print(f"rows: {rows}")
@@ -636,15 +635,15 @@ def run():
     if p_val < 0.05:
         if (localDataType == 'numeric'):
 
-            comparisons = t_test_on_cluster(test_df, bias_score, cluster_label=0)
+            comparisons = t_test_on_cluster(test_df, bias_variable, cluster_label=0)
 
             setResult(json.dumps({
                 'type': 'accordion',
                 'titleKey': 'biasAnalysis.biasedCluster.accordionTitle',
                 'comparisons': comparisons
             }))
         else:
-            comparisons = chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label=0)
+            comparisons = chi2_test_on_cluster(decoded_X_test, bias_variable, cluster_label=0)
 
             setResult(json.dumps({
                 'type': 'accordion',