@@ -50,19 +50,27 @@ def get_sensitive_feature(
     # Case 2: Numerical with prefix - CREATE BALANCED GROUPS
     if any(col == f"num__{attr}" for col in test_df.columns):
         col_name = f"num__{attr}"
-        logger.info(f"Creating balanced groups for numerical column {col_name}")
-
+        logger.info(
+            f"Creating balanced groups for numerical column {col_name}"
+        )
+
         # For AGE_YEARS, create balanced age groups instead of continuous values
         if attr == "AGE_YEARS":
             age_values = test_df[col_name]
-            logger.info(f"Age range: {age_values.min():.1f} - {age_values.max():.1f}")
-
+            logger.info(
+                f"Age range: {age_values.min():.1f} - {age_values.max():.1f}"
+            )
+
             # Handle weird age preprocessing (might be standardized/normalized)
-            if age_values.min() < 10:  # Likely standardized or processed incorrectly
-                logger.info("Detected non-standard age values, using percentile-based grouping")
+            if (
+                age_values.min() < 10
+            ):  # Likely standardized or processed incorrectly
+                logger.info(
+                    "Detected non-standard age values, using percentile-based grouping"
+                )
                 # Use percentiles to create balanced groups
                 age_percentiles = age_values.quantile([0.33, 0.67]).values
-
+
                 age_groups = []
                 for age in age_values:
                     if age <= age_percentiles[0]:
@@ -78,20 +86,29 @@ def get_sensitive_feature(
                     if age < 35:
                         age_groups.append("young_adult")  # < 35
                     elif age < 50:
-                        age_groups.append("middle_age") # 35-50
+                        age_groups.append("middle_age")  # 35-50
                     else:
-                        age_groups.append("mature") # 50+
-
+                        age_groups.append("mature")  # 50+
+
             age_series = pd.Series(age_groups, name=f"{attr}_groups")
-            logger.info(f"Age group distribution: {age_series.value_counts().to_dict()}")
+            logger.info(
+                f"Age group distribution: {age_series.value_counts().to_dict()}"
+            )
             return age_series, f"{attr}_groups", False
         else:
             # For other numerical attributes, create quantile-based groups
             try:
-                groups = pd.qcut(test_df[col_name], q=3, duplicates='drop', labels=['low', 'medium', 'high'])
+                groups = pd.qcut(
+                    test_df[col_name],
+                    q=3,
+                    duplicates="drop",
+                    labels=["low", "medium", "high"],
+                )
                 return groups, f"{attr}_groups", False
             except:
-                logger.warning(f"Could not create groups for {col_name}, using original values")
+                logger.warning(
+                    f"Could not create groups for {col_name}, using original values"
+                )
                 return test_df[col_name], col_name, False
 
     # Case 3: Categorical - reconstruct from one-hot encoding
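
Side note on the grouping logic touched by the two hunks above: ages are binned either at the 33rd/67th percentiles (when the raw values look standardized) or at fixed cut points, and other numeric attributes fall back to `pd.qcut`. The sketch below reproduces that idea outside the repo; the data and variable names are made up, and the `ValueError` comment is why the original code wraps `pd.qcut` in try/except.

```python
# Minimal sketch of the same grouping idea; data and names are illustrative only.
import pandas as pd

ages = pd.Series([23, 31, 38, 44, 52, 61, 47, 29, 55])

# Percentile-based grouping (used when the raw values look standardized):
p33, p67 = ages.quantile([0.33, 0.67])
percentile_groups = pd.Series(
    ["low" if a <= p33 else "medium" if a <= p67 else "high" for a in ages]
)

# Quantile-based fallback for other numeric attributes. With duplicates="drop",
# pd.qcut can end up with fewer bins than labels and raise ValueError, which is
# what the try/except in the diff guards against.
try:
    qcut_groups = pd.qcut(
        ages, q=3, duplicates="drop", labels=["low", "medium", "high"]
    )
except ValueError:
    qcut_groups = ages  # fall back to the raw values, as the original code does

print(percentile_groups.value_counts().to_dict())
print(qcut_groups.value_counts().to_dict())
```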
@@ -114,22 +131,26 @@ def get_sensitive_feature(
             [cat_values.get(i, "Unknown") for i in range(len(test_df))],
             name=attr,
         )
-
+
         # For education, group into broader categories to prevent 0.000 DI ratios
         if attr == "NAME_EDUCATION_TYPE":
             education_groups = []
             for edu in sensitive_features:
-                if "Higher education" in str(edu) or "Academic degree" in str(edu):
+                if "Higher education" in str(edu) or "Academic degree" in str(
+                    edu
+                ):
                     education_groups.append("higher_education")
                 elif "Secondary" in str(edu) or "Incomplete" in str(edu):
                     education_groups.append("secondary_education")
                 else:
                     education_groups.append("other_education")
-
+
             grouped_series = pd.Series(education_groups, name=f"{attr}_groups")
-            logger.info(f"Education group distribution: {grouped_series.value_counts().to_dict()}")
+            logger.info(
+                f"Education group distribution: {grouped_series.value_counts().to_dict()}"
+            )
             return grouped_series, f"{attr}_groups", False
-
+
         return sensitive_features, attr, False
 
     # Case 4: Not found
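
The hunk above only shows the tail end of the one-hot reconstruction (the `cat_values` lookup is built earlier in the function and is not visible here). Purely as an illustration of the idea, and not the repo's actual code, a self-contained way to invert a one-hot encoded education column and collapse it into the same coarse buckets could look like this; the `cat__` prefix and the column names are assumptions.

```python
# Illustrative sketch: rebuild a categorical column from one-hot encoding and
# collapse it into broader buckets. Column names and prefix are hypothetical.
import pandas as pd

one_hot = pd.DataFrame(
    {
        "cat__NAME_EDUCATION_TYPE_Higher education": [1, 0, 0],
        "cat__NAME_EDUCATION_TYPE_Secondary / secondary special": [0, 1, 0],
        "cat__NAME_EDUCATION_TYPE_Academic degree": [0, 0, 1],
    }
)

prefix = "cat__NAME_EDUCATION_TYPE_"
edu_cols = [c for c in one_hot.columns if c.startswith(prefix)]

# One common way to invert one-hot encoding: take the column holding the 1.
education = one_hot[edu_cols].idxmax(axis=1).str[len(prefix):]


def bucket(edu: str) -> str:
    # Same coarse grouping as in the diff, to avoid near-empty groups.
    if "Higher education" in edu or "Academic degree" in edu:
        return "higher_education"
    if "Secondary" in edu or "Incomplete" in edu:
        return "secondary_education"
    return "other_education"


print(education.map(bucket).value_counts().to_dict())
```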
@@ -171,9 +192,11 @@ def calculate_fairness_metrics(
     else:
         # Handle edge cases: only one group or no positive predictions
         disparate_impact_ratio = 1.0
-
+
     # Also calculate the old difference metric for backward compatibility
-    disparity_difference = frame.difference(method="between_groups")["selection_rate"]
+    disparity_difference = frame.difference(method="between_groups")[
+        "selection_rate"
+    ]
 
     return {
         "selection_rate_by_group": selection_rates.to_dict(),
@@ -222,8 +245,10 @@ def analyze_fairness(
         # Check for adverse impact using disparate impact ratio
         # DI ratio < 0.8 indicates adverse impact per four-fifths rule
         di_ratio = metrics["disparate_impact_ratio"]
-        di_threshold = approval_thresholds.get("disparate_impact_threshold", 0.8)
-
+        di_threshold = approval_thresholds.get(
+            "disparate_impact_threshold", 0.8
+        )
+
         if di_ratio < di_threshold:
             bias_flag = True
             logger.warning(
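
Worked example of the four-fifths check above: with selection rates of 0.30 and 0.45, the disparate impact ratio is 0.30 / 0.45, about 0.67, which is below the 0.8 default and would set `bias_flag`. The shape of `approval_thresholds` below is a guess; only the key name and the 0.8 fallback are visible in the diff.

```python
# Hypothetical config shape; only the key and its 0.8 default appear in the diff.
approval_thresholds = {"disparate_impact_threshold": 0.8}
metrics = {"disparate_impact_ratio": 0.30 / 0.45}  # about 0.67

di_ratio = metrics["disparate_impact_ratio"]
di_threshold = approval_thresholds.get("disparate_impact_threshold", 0.8)

# Four-fifths rule: a ratio below 0.8 indicates potential adverse impact.
if di_ratio < di_threshold:
    print(f"Adverse impact flagged: DI ratio {di_ratio:.2f} < {di_threshold}")
```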