Skip to content

Commit 2a77a2d

Browse files
author
marwan37
committed
run format script
1 parent f8d2c2d commit 2a77a2d

File tree

6 files changed

+91
-61
lines changed

credit-scorer/src/steps/training/risk_assessment.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def score_risk(evaluation: Dict) -> Dict[str, float]:
9393
else:
9494
# Convert DI ratio to risk score: 1.0 DI ratio = 0 risk, 0.8 DI ratio = 0.25 risk
9595
risk_bias = max(0.0, (0.8 - min_di_ratio) / 0.8)
96-
96+
9797
overall = round(min(1.0, 0.5 * risk_auc + 0.5 * risk_bias), 3)
9898

9999
return {
@@ -239,11 +239,11 @@ def risk_assessment(
239239
isinstance(metrics, dict)
240240
and "selection_rate_disparity" in metrics
241241
):
242-
di_ratio = metrics.get("disparate_impact_ratio", 1.0)
242+
di_ratio = metrics.get(
243+
"disparate_impact_ratio", 1.0
244+
)
243245
if di_ratio < 0.8: # Adverse impact threshold
244-
details += (
245-
f"{attr}: {di_ratio:.3f} DI ratio (< 0.8 indicates adverse impact)\n"
246-
)
246+
details += f"{attr}: {di_ratio:.3f} DI ratio (< 0.8 indicates adverse impact)\n"
247247

248248
article = get_article_for_hazard(hz["id"])
249249
hazard_sheet.append(

credit-scorer/src/utils/eval.py

Lines changed: 47 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -50,19 +50,27 @@ def get_sensitive_feature(
5050
# Case 2: Numerical with prefix - CREATE BALANCED GROUPS
5151
if any(col == f"num__{attr}" for col in test_df.columns):
5252
col_name = f"num__{attr}"
53-
logger.info(f"Creating balanced groups for numerical column {col_name}")
54-
53+
logger.info(
54+
f"Creating balanced groups for numerical column {col_name}"
55+
)
56+
5557
# For AGE_YEARS, create balanced age groups instead of continuous values
5658
if attr == "AGE_YEARS":
5759
age_values = test_df[col_name]
58-
logger.info(f"Age range: {age_values.min():.1f} - {age_values.max():.1f}")
59-
60+
logger.info(
61+
f"Age range: {age_values.min():.1f} - {age_values.max():.1f}"
62+
)
63+
6064
# Handle weird age preprocessing (might be standardized/normalized)
61-
if age_values.min() < 10: # Likely standardized or processed incorrectly
62-
logger.info("Detected non-standard age values, using percentile-based grouping")
65+
if (
66+
age_values.min() < 10
67+
): # Likely standardized or processed incorrectly
68+
logger.info(
69+
"Detected non-standard age values, using percentile-based grouping"
70+
)
6371
# Use percentiles to create balanced groups
6472
age_percentiles = age_values.quantile([0.33, 0.67]).values
65-
73+
6674
age_groups = []
6775
for age in age_values:
6876
if age <= age_percentiles[0]:
@@ -78,20 +86,29 @@ def get_sensitive_feature(
7886
if age < 35:
7987
age_groups.append("young_adult") # < 35
8088
elif age < 50:
81-
age_groups.append("middle_age") # 35-50
89+
age_groups.append("middle_age") # 35-50
8290
else:
83-
age_groups.append("mature") # 50+
84-
91+
age_groups.append("mature") # 50+
92+
8593
age_series = pd.Series(age_groups, name=f"{attr}_groups")
86-
logger.info(f"Age group distribution: {age_series.value_counts().to_dict()}")
94+
logger.info(
95+
f"Age group distribution: {age_series.value_counts().to_dict()}"
96+
)
8797
return age_series, f"{attr}_groups", False
8898
else:
8999
# For other numerical attributes, create quantile-based groups
90100
try:
91-
groups = pd.qcut(test_df[col_name], q=3, duplicates='drop', labels=['low', 'medium', 'high'])
101+
groups = pd.qcut(
102+
test_df[col_name],
103+
q=3,
104+
duplicates="drop",
105+
labels=["low", "medium", "high"],
106+
)
92107
return groups, f"{attr}_groups", False
93108
except:
94-
logger.warning(f"Could not create groups for {col_name}, using original values")
109+
logger.warning(
110+
f"Could not create groups for {col_name}, using original values"
111+
)
95112
return test_df[col_name], col_name, False
96113

97114
# Case 3: Categorical - reconstruct from one-hot encoding
@@ -114,22 +131,26 @@ def get_sensitive_feature(
114131
[cat_values.get(i, "Unknown") for i in range(len(test_df))],
115132
name=attr,
116133
)
117-
134+
118135
# For education, group into broader categories to prevent 0.000 DI ratios
119136
if attr == "NAME_EDUCATION_TYPE":
120137
education_groups = []
121138
for edu in sensitive_features:
122-
if "Higher education" in str(edu) or "Academic degree" in str(edu):
139+
if "Higher education" in str(edu) or "Academic degree" in str(
140+
edu
141+
):
123142
education_groups.append("higher_education")
124143
elif "Secondary" in str(edu) or "Incomplete" in str(edu):
125144
education_groups.append("secondary_education")
126145
else:
127146
education_groups.append("other_education")
128-
147+
129148
grouped_series = pd.Series(education_groups, name=f"{attr}_groups")
130-
logger.info(f"Education group distribution: {grouped_series.value_counts().to_dict()}")
149+
logger.info(
150+
f"Education group distribution: {grouped_series.value_counts().to_dict()}"
151+
)
131152
return grouped_series, f"{attr}_groups", False
132-
153+
133154
return sensitive_features, attr, False
134155

135156
# Case 4: Not found
@@ -171,9 +192,11 @@ def calculate_fairness_metrics(
171192
else:
172193
# Handle edge cases: only one group or no positive predictions
173194
disparate_impact_ratio = 1.0
174-
195+
175196
# Also calculate the old difference metric for backward compatibility
176-
disparity_difference = frame.difference(method="between_groups")["selection_rate"]
197+
disparity_difference = frame.difference(method="between_groups")[
198+
"selection_rate"
199+
]
177200

178201
return {
179202
"selection_rate_by_group": selection_rates.to_dict(),
@@ -222,8 +245,10 @@ def analyze_fairness(
222245
# Check for adverse impact using disparate impact ratio
223246
# DI ratio < 0.8 indicates adverse impact per four-fifths rule
224247
di_ratio = metrics["disparate_impact_ratio"]
225-
di_threshold = approval_thresholds.get("disparate_impact_threshold", 0.8)
226-
248+
di_threshold = approval_thresholds.get(
249+
"disparate_impact_threshold", 0.8
250+
)
251+
227252
if di_ratio < di_threshold:
228253
bias_flag = True
229254
logger.warning(

credit-scorer/src/utils/preprocess.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
2020

2121
class DeriveAgeFeatures(BaseEstimator, TransformerMixin):
2222
"""Create AGE_YEARS and EMPLOYMENT_YEARS from DAYS_BIRTH / DAYS_EMPLOYED.
23-
23+
2424
Implements fairness-aware age discretization to reduce bias.
2525
"""
2626

@@ -33,42 +33,48 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
3333
# Derive AGE_YEARS with fairness-aware binning
3434
if "DAYS_BIRTH" in df:
3535
age_years = -df["DAYS_BIRTH"] / 365.25
36-
36+
3737
# Create balanced age bins to reduce bias
3838
# Use quantile-based binning instead of fixed ranges
3939
df["AGE_YEARS"] = age_years
40-
40+
4141
# Add age-related features that are less biased
42-
df["AGE_SQUARED"] = age_years ** 2 # Non-linear age effect
43-
df["AGE_LOG"] = pd.Series(age_years).apply(lambda x: np.log(max(x, 18)))
44-
42+
df["AGE_SQUARED"] = age_years**2 # Non-linear age effect
43+
df["AGE_LOG"] = pd.Series(age_years).apply(
44+
lambda x: np.log(max(x, 18))
45+
)
46+
4547
# Create broad age categories to reduce granular bias
4648
df["AGE_CATEGORY"] = pd.cut(
47-
age_years,
48-
bins=[0, 35, 50, 65, 100],
49-
labels=["young", "middle", "mature", "senior"]
49+
age_years,
50+
bins=[0, 35, 50, 65, 100],
51+
labels=["young", "middle", "mature", "senior"],
5052
).astype(str)
51-
53+
5254
df = df.drop(columns=["DAYS_BIRTH"])
5355

5456
# Derive EMPLOYMENT_YEARS with stability indicators
5557
if "DAYS_EMPLOYED" in df:
5658
# Handle the special case of 365243 (unemployed marker)
5759
employment_days = df["DAYS_EMPLOYED"].copy()
58-
60+
5961
# Replace the unemployed marker with 0
6062
employment_days = employment_days.replace(365243, 0)
61-
63+
6264
df["EMPLOYMENT_YEARS"] = employment_days.apply(
6365
lambda x: abs(x) / 365.25 if x < 0 else 0
6466
)
65-
67+
6668
# Add employment stability features
6769
df["IS_EMPLOYED"] = (employment_days < 0).astype(int)
6870
df["EMPLOYMENT_STABILITY"] = df["EMPLOYMENT_YEARS"].apply(
69-
lambda x: "stable" if x > 2 else "new" if x > 0 else "unemployed"
71+
lambda x: "stable"
72+
if x > 2
73+
else "new"
74+
if x > 0
75+
else "unemployed"
7076
)
71-
77+
7278
df = df.drop(columns=["DAYS_EMPLOYED"])
7379

7480
return df

credit-scorer/src/utils/visualizations/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@
1717

1818
"""HTML component utilities for rendering compliance dashboards."""
1919

20+
from .dashboard import generate_compliance_dashboard_html
2021
from .eval import generate_eval_visualization
2122
from .whylogs import generate_whylogs_visualization
22-
from .dashboard import generate_compliance_dashboard_html
2323

2424
__all__ = [
2525
"generate_eval_visualization",
2626
"generate_whylogs_visualization",
2727
"generate_compliance_dashboard_html",
28-
]
28+
]

credit-scorer/src/utils/visualizations/dashboard.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -676,15 +676,6 @@ def generate_compliance_dashboard_html(
676676
compliance_percentage = compliance_summary.get("overall_score", 0)
677677
last_release_id = compliance_summary.get("release_id", "Unknown")
678678

679-
# Determine color based on compliance score
680-
bar_color = (
681-
"#D64045"
682-
if compliance_percentage < 60
683-
else "#FFB30F"
684-
if compliance_percentage < 80
685-
else "#478C5C"
686-
)
687-
688679
# Determine status text
689680
if compliance_percentage >= 80:
690681
status_text = "High Compliance"

credit-scorer/src/utils/visualizations/whylogs.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from zenml.types import HTMLString
77

88

9-
109
def _format_num(val: Any, precision: int = 6) -> str:
1110
"""Convert a numeric value to string, trim trailing zeros & dots."""
1211
try:
@@ -103,20 +102,28 @@ def generate_whylogs_visualization(
103102
(m for m in metrics if m in ("counts/null", "counts/nan")),
104103
None,
105104
)
106-
unique_metric = next((m for m in metrics if m.startswith("cardinality/")), None)
105+
unique_metric = next(
106+
(m for m in metrics if m.startswith("cardinality/")), None
107+
)
107108
min_metric = next((m for m in metrics if m.endswith("/min")), None)
108109
max_metric = next((m for m in metrics if m.endswith("/max")), None)
109-
mean_metric = next((m for m in metrics if m.endswith("/mean")), None)
110+
mean_metric = next(
111+
(m for m in metrics if m.endswith("/mean")), None
112+
)
110113

111114
# Get values with error handling
112115
count_val = row[count_metric] if count_metric else "N/A"
113116
null_val = row[null_metric] if null_metric else "N/A"
114117

115118
# Format the values below
116-
unique_val = _format_num(row[unique_metric], 0) if unique_metric else "N/A"
119+
unique_val = (
120+
_format_num(row[unique_metric], 0) if unique_metric else "N/A"
121+
)
117122
min_val = _format_num(row[min_metric]) if min_metric else "N/A"
118123
max_val = _format_num(row[max_metric]) if max_metric else "N/A"
119-
mean_val = _format_num(row[mean_metric], 4) if mean_metric else "N/A"
124+
mean_val = (
125+
_format_num(row[mean_metric], 4) if mean_metric else "N/A"
126+
)
120127

121128
html_content += f"""
122129
<tr>
@@ -143,7 +150,10 @@ def generate_whylogs_visualization(
143150
"""
144151

145152
# Add section about sensitive attributes if they exist
146-
if "sensitive_attributes" in dataset_info and dataset_info["sensitive_attributes"]:
153+
if (
154+
"sensitive_attributes" in dataset_info
155+
and dataset_info["sensitive_attributes"]
156+
):
147157
html_content += """
148158
<div class="alert">
149159
<h3>Sensitive Attributes Detected</h3>
@@ -166,5 +176,3 @@ def generate_whylogs_visualization(
166176
"""
167177

168178
return HTMLString(html_content)
169-
170-

0 commit comments

Comments (0)