@@ -165,58 +165,6 @@ def chi2_test_on_cluster(decoded_X_test, bias_score, cluster_label):
165165
166166 return comparisons
167167
def diffDataframe(df, features, type=None, cluster1=None, cluster2=None):
    '''
    Creates a difference dataframe between two clusters, for numerical
    or categorical data.

    Parameters:
        df: dataframe holding the feature columns and a 'Cluster' column.
        features: iterable of column names in ``df`` to compare.
        type: 'Numerical' compares column means; any other value compares
            per-category counts. (The name shadows the builtin but is kept
            so keyword callers continue to work.)
        cluster1, cluster2: cluster labels to compare. Only used when BOTH
            are given; otherwise cluster 0 is compared against all rows
            whose 'Cluster' value is not 0.

    Returns:
        A dataframe with a single 'Difference' column.
        - Numerical: one row per feature, mean(cluster 1) - mean(cluster 2).
        - Categorical: one row per (feature, category) value, holding
          count(cluster 1) - count(cluster 2). A category that is absent
          from one of the clusters counts as 0 there.

    Side effect:
        Sets the global pandas option 'display.float_format' to five
        decimals, as the original implementation did.
    '''
    # Cluster comparison (optional): both labels must be provided.
    if cluster1 is not None and cluster2 is not None:
        df1 = df[df['Cluster'] == cluster1]
        df2 = df[df['Cluster'] == cluster2]
    else:
        df1 = df[df['Cluster'] == 0]
        df2 = df[df['Cluster'] != 0]

    # Applied in both branches by the original code; kept for compatibility.
    pd.set_option('display.float_format', lambda x: '%.5f' % x)

    if type == 'Numerical':
        diff_dict = {
            feat: np.mean(df1[feat]) - np.mean(df2[feat])
            for feat in features
        }
        return pd.DataFrame.from_dict(
            diff_dict, orient='index', columns=['Difference']
        )

    frames = []
    for feat in features:
        freq1 = df1[feat].value_counts()
        freq2 = df2[feat].value_counts()
        # fill_value=0 treats a category missing from one cluster as a zero
        # count. Plain subtraction would give NaN, which the later fillna(0)
        # would turn into a misleading "no difference".
        diff = freq1.sub(freq2, fill_value=0)
        # Name the column explicitly so every frame lines up under
        # 'Difference', whatever name value_counts() gives the Series.
        frames.append(diff.to_frame(name='Difference'))

    if not frames:
        # No features requested: return an empty, correctly-shaped result.
        return pd.DataFrame(columns=['Difference'])

    return pd.concat(frames, axis=0).fillna(0)
219-
220168def run():
221169 csv_data = StringIO(data)
222170 df = pd.read_csv(csv_data)
@@ -227,7 +175,7 @@ def run():
227175 if isDemo:
228176 bias_score = "false_positive"
229177 localDataType = "categorical"
230- localIterations = 20
178+ localIterations = iterations # 20
231179
232180 print (f"Using demo parameters: bias_score={bias_score}, targetColumn={targetColumn}, dataType={localDataType}, iterations={iterations}")
233181
@@ -297,7 +245,7 @@ def run():
297245 print(f"X_train shape: {X_train.shape}")
298246
299247 if isDemo:
300- localClusterSize = X_train.shape[0]*0.01
248+ localClusterSize = clusterSize # X_train.shape[0]*0.01
301249 else:
302250 localClusterSize = clusterSize
303251
@@ -317,7 +265,7 @@ def run():
317265 cluster_df = pd.DataFrame(hbac.scores_, columns=['Cluster scores'])
318266
319267 n_most_bias = np.sum(hbac.labels_ == 0) # number of datapoints in the most deviating cluster ... user should be able to select the cluster to analyze
320- print(f"Number of datapoints in most deviating cluster: {n_most_bias}/{train_df.shape[0]}")
268+ # print(f"Number of datapoints in most deviating cluster: {n_most_bias}/{train_df.shape[0]}")
321269 print(f"Number of clusters: {hbac.n_clusters_}")
322270 print(f"Bias metric scores: {hbac.scores_}")
323271
@@ -328,7 +276,9 @@ def run():
328276
329277 # df['Cluster'] = hbac.labels_
330278
331-
279+ biasInClusters = []
280+ for i in range(clusterCount):
281+ biasInClusters.append( int(np.sum(hbac.labels_ == i)))
332282
333283
334284 if isDemo:
@@ -404,6 +354,15 @@ def run():
404354 'clusterCount': clusterCount
405355 }
406356 }))
357+
358+ setResult(json.dumps({
359+ 'type': 'TextValueSelect',
360+ 'key': 'biasAnalysis.clusterinResults.description',
361+ 'defaultIndex': 0,
362+ 'labelKey': 'biasAnalysis.clusterinResults.label',
363+ 'valueKey' : 'biasAnalysis.clusterinResults.valueText',
364+ 'values': biasInClusters
365+ }))
407366 setResult(json.dumps({
408367 'type': 'text',
409368 'data': ''
@@ -457,19 +416,23 @@ def run():
457416 'type': 'text',
458417 'key': 'biasAnalysis.testingStatisticalSignificance',
459418 'params': {
460- 't_stat': t_stat,
461- 'p_val': p_val
419+ 'p_val': "{:.3f}".format(p_val)
462420 }
463421 }))
464422
423+ setResult(json.dumps({
424+ 'type': 'text',
425+ 'key': 'biasAnalysis.higherAverage' if p_val < 0.05 else 'biasAnalysis.noSignificance'
426+ }))
427+
465428 if p_val < 0.05:
466429 print("The most biased cluster has a significantly higher average bias metric than the rest of the dataset.")
467430 else:
468431 print("No significant difference in average bias metric between the most biased cluster and the rest of the dataset.")
469- setResult(json.dumps({
470- 'type': 'heading',
471- 'headingKey': 'biasAnalysis.nodifference.heading',
472- }))
432+ # setResult(json.dumps({
433+ # 'type': 'heading',
434+ # 'headingKey': 'biasAnalysis.nodifference.heading',
435+ # }))
473436
474437 setResult(json.dumps({
475438 'type': 'heading',
@@ -492,60 +455,56 @@ def run():
492455 variables = X_test.columns.tolist()
493456 overall_means = test_df[variables].mean()
494457
458+ dropdownCategories = []
459+ for i, column in enumerate(X_test.columns):
460+ dropdownCategories.append(column)
461+
495462 # Plot bar charts for each variable, showing means for each cluster and overall mean as red line
496463 n_vars = len(variables)
497464 n_cols = 2
498465 n_rows = int(np.ceil(n_vars / n_cols))
499466
467+ charts = []
468+
500469 for i, var in enumerate(variables):
501470
502- setResult(json.dumps({
503- 'type': 'heading',
504- 'headingKey': 'biasAnalysis.distribution.heading',
505- 'params': {'variable': var}
506- }))
471+ # setResult(json.dumps({
472+ # 'type': 'heading',
473+ # 'headingKey': 'biasAnalysis.distribution.heading',
474+ # 'params': {'variable': var}
475+ # }))
507476 print(f"means: {var}")
508477 print(overall_means[var])
509478 print(means[var])
510479 print(f"========================")
511- setResult(json.dumps({
512- 'type': 'barchart',
513- 'title': var,
514- 'meanValue': overall_means[var],
515- 'data': means[var].to_json(orient='records')
516- }))
517-
518- if p_val < 0.05:
519-
520- if localDataType == 'numeric':
521- # Calculate mean per cluster for each variable
522- means = test_df.groupby("cluster_label").mean()
523-
524- # Calculate overall mean for each variable (excluding cluster_label)
525- variables = X_test.columns.tolist()
526- overall_means = test_df[variables].mean()
527-
528- # Plot bar charts for each variable, showing means for each cluster and overall mean as red line
529- n_vars = len(variables)
530- n_cols = 2
531- n_rows = int(np.ceil(n_vars / n_cols))
480+
532481
533- for i, var in enumerate(variables):
534-
535- setResult(json.dumps({
536- 'type': 'heading',
537- 'headingKey': 'biasAnalysis.distribution.heading',
538- 'params': {'variable': var}
539- }))
540-
541- setResult(json.dumps({
542- 'type': 'barchart',
482+ charts.append({
483+ 'yAxisLabel': 'distribution.frequency',
484+ 'type': 'clusterNumericalVariableDistribution',
485+ 'headingKey': 'biasAnalysis.distribution.heading',
543486 'title': var,
487+ 'meanValue': overall_means[var],
544488 'data': means[var].to_json(orient='records'),
545- 'meanValue': overall_means[var]
546- }))
489+ 'params': {'variable': var},
490+ 'selectFilterGroup' : var,
491+ 'defaultFilter': X_test.columns[0]
492+ })
547493
494+ setResult(json.dumps({
495+ 'type': 'clusterNumericalVariableDistributionAccordeon',
496+ 'clusterCount': clusterCount,
497+ 'charts': charts,
498+ 'values': dropdownCategories,
499+ 'titleKey': "biasAnalysis.numericalVariableDistributionAcrossClustersAccordeonTitle",
500+ 'defaultValue': X_test.columns[0]
501+ }))
502+
503+ if p_val < 0.05:
548504
505+ if localDataType == 'numeric':
506+ # see above for the code
507+ print("Statistically significant differences in means found.")
549508 else:
550509 # Create subplots for each column
551510 columns_to_analyze = decoded_X_test.columns[:-1] # Exclude 'cluster_label' column
@@ -556,11 +515,7 @@ def run():
556515 for i, column in enumerate(columns_to_analyze):
557516 dropdownCategories.append(column)
558517
559- setResult(json.dumps({
560- 'type': 'clusterCategorieSelect',
561- 'values': dropdownCategories,
562- 'defaultValue': columns_to_analyze[0]
563- }))
518+ charts = []
564519
565520 for i, column in enumerate(columns_to_analyze):
566521 print(f"Analyzing column: {column}")
@@ -569,22 +524,39 @@ def run():
569524 percentages = grouped_data.div(grouped_data.sum(axis=1), axis=0) * 100
570525
571526 category_values = grouped_data.columns.tolist()
572-
573527
574- setResult(json.dumps({
528+ means = []
529+ overall_counts = decoded_X_test[column].value_counts(normalize=True) * 100
530+ for cat_value, avg_pct in overall_counts.items():
531+ means.append({
532+ 'category': cat_value,
533+ 'mean': avg_pct
534+ })
535+
536+
537+ charts.append({
575538 'type': 'clusterCategorieDistribution',
576539 'headingKey': 'biasAnalysis.distribution.heading',
577540 'title': column,
578541 'categories': category_values,
579542 'data': percentages.T.to_json(orient='records'),
580543 'selectFilterGroup' : column,
581544 'params': {'variable': column},
582- 'defaultFilter': columns_to_analyze[0]
583- }))
545+ 'defaultFilter': columns_to_analyze[0],
546+ 'means': means,
547+ 'isViridis': True,
548+ 'yAxisLabel': 'distribution.frequency'
549+ })
550+
551+
584552
585553 setResult(json.dumps({
586- 'type': 'cluster_legend ',
554+ 'type': 'clusterCategorieDistributionAccordeon ',
587555 'clusterCount': clusterCount,
556+ 'charts': charts,
557+ 'values': dropdownCategories,
558+ 'titleKey': "biasAnalysis.distributionOfFeaturesAcrossClustersAccordeonTitle",
559+ 'defaultValue': columns_to_analyze[0]
588560 }))
589561
590562 df_most_biased_cluster = most_biased_cluster_df
@@ -623,6 +595,16 @@ def run():
623595 'headingKey': 'biasAnalysis.conclusion'
624596 }))
625597
598+ setResult(json.dumps({
599+ 'type': 'text',
600+ 'key': 'biasAnalysis.conclusionDescription'
601+ }))
602+
603+ setResult(json.dumps({
604+ 'type': 'export-button',
605+ }))
606+
607+
626608 setResult(json.dumps({
627609 'type': 'heading',
628610 'headingKey': 'biasAnalysis.moreInformationHeading'
0 commit comments