@@ -50,19 +50,27 @@ def get_sensitive_feature(
     # Case 2: Numerical with prefix - CREATE BALANCED GROUPS
     if any(col == f"num__{attr}" for col in test_df.columns):
         col_name = f"num__{attr}"
-        logger.info(f"Creating balanced groups for numerical column {col_name}")
-
+        logger.info(
+            f"Creating balanced groups for numerical column {col_name}"
+        )
+
         # For AGE_YEARS, create balanced age groups instead of continuous values
         if attr == "AGE_YEARS":
             age_values = test_df[col_name]
-            logger.info(f"Age range: {age_values.min():.1f} - {age_values.max():.1f}")
-
+            logger.info(
+                f"Age range: {age_values.min():.1f} - {age_values.max():.1f}"
+            )
+
             # Handle weird age preprocessing (might be standardized/normalized)
-            if age_values.min() < 10:  # Likely standardized or processed incorrectly
-                logger.info("Detected non-standard age values, using percentile-based grouping")
+            if (
+                age_values.min() < 10
+            ):  # Likely standardized or processed incorrectly
+                logger.info(
+                    "Detected non-standard age values, using percentile-based grouping"
+                )
                 # Use percentiles to create balanced groups
                 age_percentiles = age_values.quantile([0.33, 0.67]).values
-
+
                 age_groups = []
                 for age in age_values:
                     if age <= age_percentiles[0]:
@@ -78,20 +86,29 @@ def get_sensitive_feature(
                     if age < 35:
                         age_groups.append("young_adult")  # < 35
                     elif age < 50:
-                        age_groups.append("middle_age") # 35-50
+                        age_groups.append("middle_age")  # 35-50
                     else:
-                        age_groups.append("mature") # 50+
-
+                        age_groups.append("mature")  # 50+
+
             age_series = pd.Series(age_groups, name=f"{attr}_groups")
-            logger.info(f"Age group distribution: {age_series.value_counts().to_dict()}")
+            logger.info(
+                f"Age group distribution: {age_series.value_counts().to_dict()}"
+            )
             return age_series, f"{attr}_groups", False
         else:
             # For other numerical attributes, create quantile-based groups
             try:
-                groups = pd.qcut(test_df[col_name], q=3, duplicates='drop', labels=['low', 'medium', 'high'])
+                groups = pd.qcut(
+                    test_df[col_name],
+                    q=3,
+                    duplicates="drop",
+                    labels=["low", "medium", "high"],
+                )
                 return groups, f"{attr}_groups", False
             except:
-                logger.warning(f"Could not create groups for {col_name}, using original values")
+                logger.warning(
+                    f"Could not create groups for {col_name}, using original values"
+                )
                 return test_df[col_name], col_name, False
 
     # Case 3: Categorical - reconstruct from one-hot encoding
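
Side note on the grouping logic touched by the two hunks above: ages are binned either at the 33rd/67th percentiles (when the raw values look standardized) or at fixed cut points, and other numeric attributes fall back to `pd.qcut`. The sketch below reproduces that idea outside the repo; the data and variable names are made up, and the `ValueError` comment is why the original code wraps `pd.qcut` in try/except.

```python
# Minimal sketch of the same grouping idea; data and names are illustrative only.
import pandas as pd

ages = pd.Series([23, 31, 38, 44, 52, 61, 47, 29, 55])

# Percentile-based grouping (used when the raw values look standardized):
p33, p67 = ages.quantile([0.33, 0.67])
percentile_groups = pd.Series(
    ["low" if a <= p33 else "medium" if a <= p67 else "high" for a in ages]
)

# Quantile-based fallback for other numeric attributes. With duplicates="drop",
# pd.qcut can end up with fewer bins than labels and raise ValueError, which is
# what the try/except in the diff guards against.
try:
    qcut_groups = pd.qcut(
        ages, q=3, duplicates="drop", labels=["low", "medium", "high"]
    )
except ValueError:
    qcut_groups = ages  # fall back to the raw values, as the original code does

print(percentile_groups.value_counts().to_dict())
print(qcut_groups.value_counts().to_dict())
```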
@@ -114,22 +131,26 @@ def get_sensitive_feature(
             [cat_values.get(i, "Unknown") for i in range(len(test_df))],
             name=attr,
         )
-
+
         # For education, group into broader categories to prevent 0.000 DI ratios
         if attr == "NAME_EDUCATION_TYPE":
             education_groups = []
             for edu in sensitive_features:
-                if "Higher education" in str(edu) or "Academic degree" in str(edu):
+                if "Higher education" in str(edu) or "Academic degree" in str(
+                    edu
+                ):
                     education_groups.append("higher_education")
                 elif "Secondary" in str(edu) or "Incomplete" in str(edu):
                     education_groups.append("secondary_education")
                 else:
                     education_groups.append("other_education")
-
+
             grouped_series = pd.Series(education_groups, name=f"{attr}_groups")
-            logger.info(f"Education group distribution: {grouped_series.value_counts().to_dict()}")
+            logger.info(
+                f"Education group distribution: {grouped_series.value_counts().to_dict()}"
+            )
             return grouped_series, f"{attr}_groups", False
-
+
         return sensitive_features, attr, False
 
     # Case 4: Not found
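
The hunk above only shows the tail end of the one-hot reconstruction (the `cat_values` lookup is built earlier in the function and is not visible here). Purely as an illustration of the idea, and not the repo's actual code, a self-contained way to invert a one-hot encoded education column and collapse it into the same coarse buckets could look like this; the `cat__` prefix and the column names are assumptions.

```python
# Illustrative sketch: rebuild a categorical column from one-hot encoding and
# collapse it into broader buckets. Column names and prefix are hypothetical.
import pandas as pd

one_hot = pd.DataFrame(
    {
        "cat__NAME_EDUCATION_TYPE_Higher education": [1, 0, 0],
        "cat__NAME_EDUCATION_TYPE_Secondary / secondary special": [0, 1, 0],
        "cat__NAME_EDUCATION_TYPE_Academic degree": [0, 0, 1],
    }
)

prefix = "cat__NAME_EDUCATION_TYPE_"
edu_cols = [c for c in one_hot.columns if c.startswith(prefix)]

# One common way to invert one-hot encoding: take the column holding the 1.
education = one_hot[edu_cols].idxmax(axis=1).str[len(prefix):]


def bucket(edu: str) -> str:
    # Same coarse grouping as in the diff, to avoid near-empty groups.
    if "Higher education" in edu or "Academic degree" in edu:
        return "higher_education"
    if "Secondary" in edu or "Incomplete" in edu:
        return "secondary_education"
    return "other_education"


print(education.map(bucket).value_counts().to_dict())
```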
@@ -171,9 +192,11 @@ def calculate_fairness_metrics(
     else:
         # Handle edge cases: only one group or no positive predictions
         disparate_impact_ratio = 1.0
-
+
     # Also calculate the old difference metric for backward compatibility
-    disparity_difference = frame.difference(method="between_groups")["selection_rate"]
+    disparity_difference = frame.difference(method="between_groups")[
+        "selection_rate"
+    ]
 
     return {
         "selection_rate_by_group": selection_rates.to_dict(),
@@ -222,8 +245,10 @@ def analyze_fairness(
         # Check for adverse impact using disparate impact ratio
         # DI ratio < 0.8 indicates adverse impact per four-fifths rule
         di_ratio = metrics["disparate_impact_ratio"]
-        di_threshold = approval_thresholds.get("disparate_impact_threshold", 0.8)
-
+        di_threshold = approval_thresholds.get(
+            "disparate_impact_threshold", 0.8
+        )
+
         if di_ratio < di_threshold:
             bias_flag = True
             logger.warning(
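
Worked example of the four-fifths check above: with selection rates of 0.30 and 0.45, the disparate impact ratio is 0.30 / 0.45, about 0.67, which is below the 0.8 default and would set `bias_flag`. The shape of `approval_thresholds` below is a guess; only the key name and the 0.8 fallback are visible in the diff.

```python
# Hypothetical config shape; only the key and its 0.8 default appear in the diff.
approval_thresholds = {"disparate_impact_threshold": 0.8}
metrics = {"disparate_impact_ratio": 0.30 / 0.45}  # about 0.67

di_ratio = metrics["disparate_impact_ratio"]
di_threshold = approval_thresholds.get("disparate_impact_threshold", 0.8)

# Four-fifths rule: a ratio below 0.8 indicates potential adverse impact.
if di_ratio < di_threshold:
    print(f"Adverse impact flagged: DI ratio {di_ratio:.2f} < {di_threshold}")
```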