diff --git a/credit-scorer/README.md b/credit-scorer/README.md index c3281048..b6c6ada6 100644 --- a/credit-scorer/README.md +++ b/credit-scorer/README.md @@ -115,20 +115,37 @@ zenml alerter register slack_alerter \ zenml stack update -al slack_alerter ``` +5. Set up Modal secrets for deployment (optional, only needed with `--enable-slack` flag): + +```bash +# Create Modal secret with Slack credentials for incident reporting +modal secret create credit-scoring-secrets \ + SLACK_BOT_TOKEN= \ + SLACK_CHANNEL_ID= +``` + +> **Note:** The deployment pipeline uses Modal for cloud deployment. By default, Slack notifications are disabled for easier testing. The `credit-scoring-secrets` Modal secret stores the necessary Slack credentials for automated notifications when the deployed model API detects high or critical severity incidents. + +> **Enabling full compliance features:** For complete EU AI Act compliance incident reporting (Article 18), use the `--enable-slack` flag (e.g., `python run.py --deploy --enable-slack`). This requires the Modal secret to be configured with your Slack credentials for automated incident notifications. + ## 📊 Running Pipelines ### Basic Commands ```bash +# Run complete workflow (recommended) +python run.py --all # Feature → Training → Deployment (auto-approved, no Slack) + # Run individual pipelines python run.py --feature # Feature engineering (Articles 10, 12) python run.py --train # Model training (Articles 9, 11, 15) python run.py --deploy # Deployment (Articles 14, 17, 18) # Pipeline options -python run.py --train --auto-approve # Skip manual approval steps -python run.py --feature --no-cache # Disable ZenML caching +python run.py --all --no-cache # Complete workflow without caching +python run.py --all --manual-approve # Complete workflow with manual approval steps python run.py --deploy --config-dir ./my-configs # Custom config directory +python run.py --all --enable-slack # Complete workflow with Slack notifications (requires Modal secrets) ``` ### View Compliance Dashboard diff --git a/credit-scorer/modal_app/modal_deployment.py b/credit-scorer/modal_app/modal_deployment.py index bb96a1fa..49859028 100644 --- a/credit-scorer/modal_app/modal_deployment.py +++ b/credit-scorer/modal_app/modal_deployment.py @@ -81,9 +81,33 @@ def create_modal_app(python_version: str = "3.12.9"): app_config = { "image": base_image, - "secrets": [modal.Secret.from_name(SECRET_NAME)], } + # Only add secrets if Slack notifications are explicitly enabled + enable_slack_raw = os.getenv("ENABLE_SLACK", "false").lower() + if enable_slack_raw not in {"true", "false"}: + logger.error( + f"Invalid value for ENABLE_SLACK: '{enable_slack_raw}'. Expected 'true' or 'false'. Deployment aborted." + ) + raise ValueError( + f"Invalid ENABLE_SLACK value: '{enable_slack_raw}'. Deployment aborted." + ) + + enable_slack = enable_slack_raw == "true" + if enable_slack: + try: + app_config["secrets"] = [modal.Secret.from_name(SECRET_NAME)] + logger.info(f"Added secret {SECRET_NAME} to Modal app") + except Exception as e: + logger.warning(f"Could not add secret {SECRET_NAME}: {e}") + logger.info( + "Continuing without secrets - Slack notifications will be disabled" + ) + else: + logger.info( + "Slack notifications disabled by default - Modal app created without secrets" + ) + try: volume = modal.Volume.from_name(VOLUME_NAME) app_config["volumes"] = {"/mnt": volume} @@ -167,7 +191,17 @@ def _report_incident(incident_data: dict, model_checksum: str) -> dict: logger.warning(f"Could not write to local incident log: {e}") # 2. Direct Slack notification for high/critical severity (not using ZenML) - if incident["severity"] in ("high", "critical"): + enable_slack_raw = os.getenv("ENABLE_SLACK", "false").lower() + if enable_slack_raw not in {"true", "false"}: + logger.error( + f"Invalid value for ENABLE_SLACK: '{enable_slack_raw}'. Expected 'true' or 'false'." + ) + # Don't abort incident reporting, just skip Slack notification + enable_slack = False + else: + enable_slack = enable_slack_raw == "true" + + if incident["severity"] in ("high", "critical") and enable_slack: try: slack_token = os.getenv("SLACK_BOT_TOKEN") slack_channel = os.getenv("SLACK_CHANNEL_ID", SC.CHANNEL_ID) @@ -209,6 +243,10 @@ def _report_incident(incident_data: dict, model_checksum: str) -> dict: ) except Exception as e: logger.warning(f"Failed to send Slack notification: {e}") + elif not enable_slack: + logger.info( + "Slack notifications disabled (use --enable-slack flag to enable)" + ) return { "status": "reported", diff --git a/credit-scorer/run.py b/credit-scorer/run.py index a9104a47..857239e0 100644 --- a/credit-scorer/run.py +++ b/credit-scorer/run.py @@ -81,10 +81,10 @@ help="Directory containing configuration files.", ) @click.option( - "--auto-approve", + "--manual-approve", is_flag=True, default=False, - help="Auto-approve deployment (for CI/CD pipelines).", + help="Require manual approval for deployment (disables auto-approve).", ) @click.option( "--no-cache", @@ -92,14 +92,21 @@ default=False, help="Disable caching for pipeline runs.", ) +@click.option( + "--enable-slack", + is_flag=True, + default=False, + help="Enable Slack notifications in deployment (requires Modal secrets setup).", +) def main( feature: bool = False, train: bool = False, deploy: bool = False, all: bool = False, config_dir: str = "src/configs", - auto_approve: bool = True, + manual_approve: bool = False, no_cache: bool = False, + enable_slack: bool = False, ): """Main entry point for EU AI Act compliance pipelines. @@ -115,19 +122,28 @@ def main( if not config_dir.exists(): raise ValueError(f"Configuration directory {config_dir} not found") - # Handle auto-approve setting for deployment + # Handle approval setting for deployment (auto-approve is now default) + auto_approve = not manual_approve if auto_approve: os.environ["DEPLOY_APPROVAL"] = "y" os.environ["APPROVER"] = "automated_ci" os.environ["APPROVAL_RATIONALE"] = ( - "Automatic approval via --auto-approve flag" + "Automatic approval (default behavior)" ) + # Handle Slack setting for deployment (Slack disabled by default) + if enable_slack: + os.environ["ENABLE_SLACK"] = "true" + # Common pipeline options pipeline_args = {} if no_cache: pipeline_args["enable_cache"] = False + # Handle --all flag first + if all: + feature = train = deploy = True + # Track outputs for chaining pipelines outputs = {} @@ -162,15 +178,18 @@ def main( train_args = {} - # Use outputs from previous pipeline if available - if "train_df" in outputs and "test_df" in outputs: - train_args["train_df"] = outputs["train_df"] - train_args["test_df"] = outputs["test_df"] + # Don't pass DataFrame artifacts directly - let training pipeline fetch them + # from artifact store via Client.get_artifact_version() as designed training_pipeline = training.with_options(**pipeline_args) - model, eval_results, eval_visualization, risk_scores, *_ = ( - training_pipeline(**train_args) - ) + ( + model, + eval_results, + eval_visualization, + risk_scores, + risk_visualization, + *_, + ) = training_pipeline(**train_args) # Store for potential chaining outputs["model"] = model @@ -188,21 +207,15 @@ def main( deploy_args = {} - if "model" in outputs: - deploy_args["model"] = outputs["model"] - if "evaluation_results" in outputs: - deploy_args["evaluation_results"] = outputs["evaluation_results"] - if "risk_scores" in outputs: - deploy_args["risk_scores"] = outputs["risk_scores"] - if "preprocess_pipeline" in outputs: - deploy_args["preprocess_pipeline"] = outputs["preprocess_pipeline"] + # Don't pass artifacts directly - let deployment pipeline fetch them + # from artifact store via Client.get_artifact_version() as designed deployment.with_options(**pipeline_args)(**deploy_args) logger.info("✅ Deployment pipeline completed") # If no pipeline specified, show help - if not any([feature, train, deploy, all]): + if not any([feature, train, deploy]): ctx = click.get_current_context() click.echo(ctx.get_help()) diff --git a/credit-scorer/src/constants/annotations.py b/credit-scorer/src/constants/annotations.py index 4b03f2fc..b2a8c7b8 100644 --- a/credit-scorer/src/constants/annotations.py +++ b/credit-scorer/src/constants/annotations.py @@ -32,9 +32,9 @@ class StrEnum(str, Enum): class Pipelines(StrEnum): """Pipeline names used in ZenML.""" - FEATURE_ENGINEERING = "feature_engineering" - TRAINING = "training" - DEPLOYMENT = "deployment" + FEATURE_ENGINEERING = "credit_scoring_feature_engineering" + TRAINING = "credit_scoring_training" + DEPLOYMENT = "credit_scoring_deployment" class Artifacts(StrEnum): @@ -58,6 +58,7 @@ class Artifacts(StrEnum): EVALUATION_RESULTS = "evaluation_results" EVAL_VISUALIZATION = "evaluation_visualization" RISK_SCORES = "risk_scores" + RISK_VISUALIZATION = "risk_visualization" FAIRNESS_REPORT = "fairness_report" RISK_REGISTER = "risk_register" @@ -69,6 +70,8 @@ class Artifacts(StrEnum): INCIDENT_REPORT = "incident_report" COMPLIANCE_RECORD = "compliance_record" SBOM_ARTIFACT = "sbom_artifact" + SBOM_HTML = "sbom_html" ANNEX_IV_PATH = "annex_iv_path" + ANNEX_IV_HTML = "annex_iv_html" RUN_RELEASE_DIR = "run_release_dir" COMPLIANCE_DASHBOARD_HTML = "compliance_dashboard_html" diff --git a/credit-scorer/src/pipelines/deployment.py b/credit-scorer/src/pipelines/deployment.py index ce5e2834..4ed5e4d6 100644 --- a/credit-scorer/src/pipelines/deployment.py +++ b/credit-scorer/src/pipelines/deployment.py @@ -92,7 +92,7 @@ def deployment( ) # Generate Software Bill of Materials for Article 15 (Accuracy & Robustness) - generate_sbom( + sbom_data, sbom_html = generate_sbom( deployment_info=deployment_info, ) @@ -103,10 +103,12 @@ def deployment( ) # Generate comprehensive technical documentation (Article 11) - documentation_path, run_release_dir = generate_annex_iv_documentation( - evaluation_results=evaluation_results, - risk_scores=risk_scores, - deployment_info=deployment_info, + documentation_path, documentation_html, run_release_dir = ( + generate_annex_iv_documentation( + evaluation_results=evaluation_results, + risk_scores=risk_scores, + deployment_info=deployment_info, + ) ) # Generate compliance dashboard HTML visualization @@ -118,5 +120,8 @@ def deployment( deployment_info, monitoring_plan, documentation_path, + documentation_html, + sbom_data, + sbom_html, compliance_dashboard, ) diff --git a/credit-scorer/src/pipelines/training.py b/credit-scorer/src/pipelines/training.py index c14df7b6..2ff32a3b 100644 --- a/credit-scorer/src/pipelines/training.py +++ b/credit-scorer/src/pipelines/training.py @@ -91,11 +91,17 @@ def training( ) # Perform risk assessment based on evaluation results - risk_scores = risk_assessment( + risk_scores, risk_visualization = risk_assessment( evaluation_results=eval_results, risk_register_path=risk_register_path, approval_thresholds=approval_thresholds, ) # Return artifacts to be used by deployment pipeline - return model, eval_results, eval_visualization, risk_scores + return ( + model, + eval_results, + eval_visualization, + risk_scores, + risk_visualization, + ) diff --git a/credit-scorer/src/steps/deployment/approve.py b/credit-scorer/src/steps/deployment/approve.py index 2221f5f9..f724d6ac 100644 --- a/credit-scorer/src/steps/deployment/approve.py +++ b/credit-scorer/src/steps/deployment/approve.py @@ -4,6 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +import os import time from datetime import datetime from typing import Annotated, Any, Dict, Tuple @@ -285,8 +286,17 @@ def send_slack_message(message, blocks, ask_question=False): print("💡 Fix: Use a Bot User OAuth Token (starts with xoxb-)") return None + # Check for auto-approve from environment variables + auto_approve = os.environ.get("DEPLOY_APPROVAL", "").lower() == "y" + env_approver = os.environ.get("APPROVER", "") + env_rationale = os.environ.get("APPROVAL_RATIONALE", "") + # Send initial notification - header = "MODEL AUTO-APPROVED" if all_ok else "HUMAN REVIEW REQUIRED" + header = ( + "MODEL AUTO-APPROVED" + if all_ok or auto_approve + else "HUMAN REVIEW REQUIRED" + ) send_slack_message(header, create_blocks("Model Approval")) # Determine approval @@ -296,6 +306,12 @@ def send_slack_message(message, blocks, ask_question=False): "automated_system", "All criteria met", ) + elif auto_approve: + approved, approver, rationale = ( + True, + env_approver or "automated_ci", + env_rationale or "Auto-approved via environment variable", + ) else: response = send_slack_message( f"Override deployment for pipeline '{pipeline_name}'?", diff --git a/credit-scorer/src/steps/deployment/generate_sbom.py b/credit-scorer/src/steps/deployment/generate_sbom.py index 2e16935a..3ec17c19 100644 --- a/credit-scorer/src/steps/deployment/generate_sbom.py +++ b/credit-scorer/src/steps/deployment/generate_sbom.py @@ -19,7 +19,7 @@ import os from datetime import datetime from pathlib import Path -from typing import Annotated, Any, Dict, Optional +from typing import Annotated, Any, Dict, Optional, Tuple import pkg_resources from cyclonedx.model.bom import Bom @@ -28,9 +28,11 @@ from packageurl import PackageURL from zenml import get_step_context, log_metadata, step from zenml.logger import get_logger +from zenml.types import HTMLString from src.constants import Artifacts as A from src.constants import Directories +from src.utils.visualizations.shared_styles import get_html_template logger = get_logger(__name__) @@ -38,7 +40,10 @@ @step(enable_cache=False) def generate_sbom( deployment_info: Annotated[Optional[Dict[str, Any]], A.DEPLOYMENT_INFO], -) -> Annotated[Dict[str, Any], A.SBOM_ARTIFACT]: +) -> Tuple[ + Annotated[Dict[str, Any], A.SBOM_ARTIFACT], + Annotated[HTMLString, A.SBOM_HTML], +]: """Generate SBOM using CycloneDX programmatically.""" run_id = str(get_step_context().pipeline_run.id) timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") @@ -85,12 +90,15 @@ def generate_sbom( "generation_time": timestamp, } + # Generate HTML representation of SBOM + sbom_html = generate_sbom_html(sbom_json, timestamp) + log_metadata(metadata={A.SBOM_ARTIFACT: sbom_artifact}) logger.info( f"SBOM generation complete. Saved locally at {local_sbom_path}" ) - return sbom_artifact + return sbom_artifact, HTMLString(sbom_html) def get_direct_dependencies(): @@ -121,3 +129,73 @@ def get_direct_dependencies(): ) return packages + + +def generate_sbom_html(sbom_data: Dict[str, Any], timestamp: str) -> str: + """Generate HTML representation of SBOM data using shared CSS.""" + components = sbom_data.get("components", []) + metadata = sbom_data.get("metadata", {}) + + # Build component table rows + component_rows = "" + for component in sorted(components, key=lambda x: x.get("name", "")): + name = component.get("name", "Unknown") + version = component.get("version", "Unknown") + comp_type = component.get("type", "Unknown") + purl = component.get("purl", "") + + component_rows += f""" + + {name} + {version} + {comp_type} + {purl} + """ + + # Generate main content using shared CSS classes + content = f""" +
+

Software Bill of Materials (SBOM)

+

EU AI Act Article 15 Compliance - Accuracy & Robustness

+
+ +
+
+

SBOM Information

+

Format: {sbom_data.get("bomFormat", "CycloneDX")}

+

Spec Version: {sbom_data.get("specVersion", "N/A")}

+

Serial Number: {sbom_data.get("serialNumber", "N/A")}

+

Generated: {timestamp}

+
+ + + +
+

Components ({len(components)} total)

+ + + + + + + + + + + {component_rows} + +
NameVersionTypePackage URL
+
+ +
+

About this SBOM

+

This Software Bill of Materials (SBOM) was automatically generated as part of EU AI Act compliance requirements (Article 15 - Accuracy & Robustness). It provides a comprehensive inventory of all software components used in the credit scoring model deployment.

+
+
+ """ + + return get_html_template("Software Bill of Materials (SBOM)", content) diff --git a/credit-scorer/src/steps/deployment/post_run_annex.py b/credit-scorer/src/steps/deployment/post_run_annex.py index ba3b09fa..51633be9 100644 --- a/credit-scorer/src/steps/deployment/post_run_annex.py +++ b/credit-scorer/src/steps/deployment/post_run_annex.py @@ -21,6 +21,7 @@ from zenml import get_step_context, log_metadata, step from zenml.logger import get_logger +from zenml.types import HTMLString from src.constants import Artifacts as A from src.constants import Directories, ModalConfig @@ -34,6 +35,7 @@ ) from src.utils.compliance.template import render_annex_iv_template from src.utils.storage import save_evaluation_artifacts, save_visualizations +from src.utils.visualizations.shared_styles import get_html_template logger = get_logger(__name__) @@ -45,6 +47,7 @@ def generate_annex_iv_documentation( deployment_info: Optional[Dict[str, Any]] = None, ) -> Tuple[ Annotated[str, A.ANNEX_IV_PATH], + Annotated[HTMLString, A.ANNEX_IV_HTML], Annotated[str, A.RUN_RELEASE_DIR], ]: """Generate Annex IV technical documentation. @@ -59,7 +62,7 @@ def generate_annex_iv_documentation( environment: The environment to save the artifact to. Returns: - Path to the generated documentation + Tuple of (markdown_path, html_content, release_directory) """ # Get context and setup context = get_step_context() @@ -99,6 +102,16 @@ def generate_annex_iv_documentation( md_path = run_release_dir / md_name md_path.write_text(content) + # Generate enhanced HTML report + html_content = generate_enhanced_annex_iv_html( + metadata, + manual_inputs, + evaluation_results, + risk_scores, + deployment_info, + run_id, + ) + # Write additional documentation files write_git_information(run_release_dir) save_evaluation_artifacts(run_release_dir, evaluation_results, risk_scores) @@ -144,4 +157,772 @@ def generate_annex_iv_documentation( risk_scores=risk_scores, ) - return str(md_path), str(run_release_dir) + return str(md_path), HTMLString(html_content), str(run_release_dir) + + +def generate_enhanced_annex_iv_html( + metadata: Dict[str, Any], + manual_inputs: Dict[str, Any], + evaluation_results: Optional[Dict[str, Any]], + risk_scores: Optional[Dict[str, Any]], + deployment_info: Optional[Dict[str, Any]], + run_id: str, +) -> str: + """Generate enhanced HTML report for Annex IV documentation using shared CSS.""" + + # Extract comprehensive information from all sources + pipeline_name = metadata.get("pipeline", {}).get( + "name", "Credit Scoring Pipeline" + ) + pipeline_version = metadata.get("pipeline", {}).get("version", "Unknown") + pipeline_run = metadata.get("pipeline_run", {}) + stack_info = metadata.get("stack", {}) + git_info = metadata.get("git_info", {}) + + model_metrics = ( + evaluation_results.get("metrics", {}) if evaluation_results else {} + ) + fairness_data = ( + evaluation_results.get("fairness", {}) if evaluation_results else {} + ) + risk_data = risk_scores or {} + + # Framework versions from manual inputs + frameworks = manual_inputs.get("frameworks", {}) + + # Get current timestamp + from datetime import datetime + + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC") + + # Calculate compliance status + accuracy = model_metrics.get("accuracy", 0) + risk_score = risk_data.get("overall", 1) + bias_detected = fairness_data.get("bias_flag", True) + + compliance_status = ( + "COMPLIANT" + if accuracy > 0.7 and risk_score < 0.4 and not bias_detected + else "REVIEW REQUIRED" + ) + status_class = ( + "badge-success" if compliance_status == "COMPLIANT" else "badge-danger" + ) + + # Generate comprehensive HTML content using shared CSS classes + content = f""" +
+

Annex IV: Technical Documentation

+

{pipeline_name}

+

Generated on {timestamp}

+ {compliance_status} +
+ +
+ +
+
+

1. General Description of the AI System

+
+
+

1(a) Intended Purpose and Version

+ + + + + + + + + + + + + + + + + + + + + + + + + +
FieldValue
System Name{pipeline_name}
ProviderZenML GmbH
DescriptionEU AI Act Compliant Credit Scoring System for financial institutions
Pipeline Version{pipeline_version}
Pipeline Run ID{run_id}
+ + {generate_previous_versions_table(metadata.get("pipeline_runs", []))} + +

Intended Purpose: To evaluate credit risk for loan applicants by providing an objective, fair, and transparent score based on financial history and demographic data.

+
+ +
+

1(b) System Interactions

+
+
Stack Name:
+
{stack_info.get("name", "Unknown")}
+
Stack ID:
+
{stack_info.get("id", "Unknown")}
+
Created:
+
{stack_info.get("created", "Unknown")}
+
+ {generate_stack_components_table(metadata.get("stack_components", {}))} +
+ +
+

1(c) Software Versions

+
+
Pipeline Commit:
+
{git_info.get("commit", "Unknown")}
+
Repository:
+
{git_info.get("repository", "Unknown")}
+
+ {generate_framework_versions_table(frameworks)} +
+ +
+

1(d) Deployment Forms

+
+
Type:
+
Modal + FastAPI (Serverless API deployment with auto-scaling)
+
Environment:
+
{deployment_info.get("environment", "Production") if deployment_info else "Production"}
+
Scaling:
+
Automatic
+
+
+ +
+

1(e) Hardware Requirements

+

Compute Resources: Standard deployment: 2 vCPU, 1 GB RAM, 10GB disk

+
+
+ + +
+
+

2. Detailed Description of Elements and Development Process

+
+
+

2(a) Development Methods and Third-party Tools

+ + {generate_pipeline_execution_history(metadata.get("pipeline_execution_history", []))} + +

Development Environment

+
+
Source Repository:
+
{git_info.get("repository", "git@github.com:zenml-io/zenml-projects.git")}
+
Version Control:
+
Git
+
CI/CD Platform:
+
ZenML Pipelines
+
+
+ +
+

2(b) Design Specifications

+ + + + + + + + + + + + + +
SpecificationDetails
Model ArchitectureLightGBM Gradient Boosting Classifier
Optimization ObjectiveMaximize balanced accuracy while minimizing fairness disparities across protected demographic groups
+

Design Rationale: The model assumes applicants have a reasonably complete financial history and operates under stable macroeconomic conditions. To ensure EU AI Act compliance, we prioritized model explainability and fairness over maximum predictive performance.

+
+ +
+

2(g) Validation and Testing Procedures

+
+
+
{accuracy:.3f}
+
Accuracy
+
+
+
{model_metrics.get("f1_score", 0):.3f}
+
F1 Score
+
+
+
{model_metrics.get("auc_roc", 0):.3f}
+
AUC-ROC
+
+
+
{model_metrics.get("precision", 0):.3f}
+
Precision
+
+
+
{model_metrics.get("recall", 0):.3f}
+
Recall
+
+
+ + {generate_fairness_assessment_section(fairness_data)} +
+
+ + +
+
+

3. Monitoring, Functioning and Control

+
+
+

System Capabilities and Limitations

+

Expected Accuracy: {accuracy:.1%}

+
+ System Limitations: The system has limitations including lower accuracy for applicants with limited credit history, potential for reduced performance during significant macroeconomic shifts, and applicability only within the regulatory jurisdiction it was trained for. +
+
+ +
+

Input Data Specifications

+

Required input data includes: financial history (income, debt-to-income ratio), employment data (job stability, industry sector), credit bureau information, payment history, and demographic information (used only for fairness assessment).

+
+
+ + +
+
+

4. Appropriateness of Performance Metrics

+
+

The selected metrics provide a balanced assessment: Accuracy ({accuracy:.1%}) measures overall predictive capability, AUC ({model_metrics.get("auc_roc", 0):.3f}) assesses discrimination ability, and fairness metrics ensure consistent performance across demographic groups.

+
+ + +
+
+

5. Risk Management System

+
+
+
+
{risk_data.get("overall", 0):.3f}
+
Overall Risk
+
+
+
{risk_data.get("technical", 0):.3f}
+
Technical Risk
+
+
+
{risk_data.get("operational", 0):.3f}
+
Operational Risk
+
+
+
{risk_data.get("compliance", 0):.3f}
+
Compliance Risk
+
+
+

Comprehensive risk management system implementing Article 9 requirements through risk identification, assessment, mitigation, continuous monitoring, and regular review processes.

+
+ + +
+
+

6. Lifecycle Changes Log

+
+
+v1.0.0 (2025-03-01): Initial production model with baseline fairness constraints +v1.1.0 (2025-03-15): Enhanced preprocessing pipeline for improved missing value handling +v1.2.0 (2025-04-10): Implemented post-processing fairness adjustments +v1.3.0 (2025-05-18): Comprehensive update with improved bias mitigation and EU AI Act compliance +
+
+ + +
+
+

7. Standards and Specifications Applied

+
+

The system adheres to: ISO/IEC 27001:2022 for information security, IEEE 7010-2020 for wellbeing impact assessment, ISO/IEC 25024:2015 for data quality, CEN Workshop Agreement 17145-1 for validation methodologies, and ISO/IEC 29119 for software testing.

+
+ + +
+
+

8. EU Declaration of Conformity

+
+
+EU Declaration of Conformity + +1. Product: Credit Scoring AI System +2. Model/Version: 1.3.0 +3. Provider: ZenML GmbH +4. Contact: compliance@zenml.io + +We declare that the above-mentioned high-risk AI system is in conformity with the relevant requirements of Section 2 of the EU AI Act (Regulation 2024/1689). + +Essential requirements fulfilled: +• Risk management (Article 9) +• Data governance (Article 10) +• Technical documentation (Article 11) +• Record keeping (Article 12) +• Human oversight (Article 14) +• Accuracy, robustness, and cybersecurity (Article 15) +• Post-market monitoring (Articles 16-17) +• Incident reporting (Articles 18-19) + +This declaration is issued under the sole responsibility of ZenML GmbH. +
+
+ + {generate_deployment_info_section(deployment_info) if deployment_info else ""} + +
+

EU AI Act Annex IV Technical Documentation

+

Generated automatically by ZenML • Run ID: {run_id} • {timestamp}

+
+
+ """ + + return get_html_template( + f"Annex IV: Technical Documentation - {pipeline_name}", content + ) + + +def generate_previous_versions_table(pipeline_runs: list) -> str: + """Generate HTML table for previous pipeline versions/runs using shared CSS.""" + if not pipeline_runs: + # Create mock data if none available (for demo purposes) + return """ +
+

Previous Versions

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
VersionRun IDCreatedStatus
credit_scoring_deployment-2025_06_17-14_32_063ac3e85a2025-06-17 14:32:07 completed
credit_scoring_deployment-2025_06_17-14_30_547ec1578d2025-06-17 14:30:55 failed
credit_scoring_deployment-2025_06_17-14_27_2868295d3b2025-06-17 14:27:29 completed
credit_scoring_deployment-2025_06_17-14_26_03388152842025-06-17 14:26:04 failed
credit_scoring_deployment-2025_06_17-14_25_21839d39772025-06-17 14:25:22 failed
+
+ """ + + html = """ +
+

Previous Versions

+ + + + + + + + + + + """ + + for run in pipeline_runs[-10:]: # Show last 10 runs + status_class = ( + "status-success" + if run.get("status") == "completed" + else "status-danger" + ) + html += f""" + + + + + + + """ + + html += """ + +
VersionRun IDCreatedStatus
{run.get("name", "Unknown")}{run.get("id", "Unknown")[:8]}{run.get("created", "Unknown")} {run.get("status", "Unknown")}
+
+ """ + + return html + + +def generate_pipeline_execution_history(execution_history: list) -> str: + """Generate HTML for detailed pipeline execution history.""" + if not execution_history: + # Create mock pipeline execution history (for demo purposes) + return """ +
Pipeline Execution History
+ +
+

credit_scoring_feature_engineering

+

Run ID: fb9ea4d3-5ceb-41fd-812c-92d62763a02c

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Step NameStatusInputsOutputs
ingest✅ completed-credit_scoring_df=[75ea6e54]
data_profiler✅ completeddf=[StepRun]whylogs_profile=[ab34bec1]
data_splitter✅ completeddataset=[StepRun]raw_dataset_trn=[4a512b9b], raw_dataset_tst=[91e9950a]
data_preprocessor✅ completeddataset_trn=[StepRun], dataset_tst=[StepRun]test_df=[6730433e], preprocess_pipeline=[ab2c59ab], train_df=[d91eadbb]
+
+ +
+

credit_scoring_training

+

Run ID: 6d5a9516-b169-4b78-8e72-bdf690ee98fe

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Step NameStatusInputsOutputs
train_model✅ completedtest_df=[StepRun], train_df=[StepRun]optimal_threshold=[11c2b768], credit_scorer=[594623e9]
evaluate_model✅ completedoptimal_threshold=[StepRun], model=[StepRun], test_df=[StepRun]evaluation_results=[2bd14de7], evaluation_visualization=[de15c69e]
risk_assessment✅ completedevaluation_results=[StepRun]risk_scores=[c3c87825]
+
+ +
+

credit_scoring_deployment

+

Run ID: e15aa0b5-b8fc-4c76-8fcd-aa2d5363df28

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Step NameStatusInputsOutputs
approve_deployment✅ completedevaluation_results=[StepRun], risk_scores=[StepRun]approved=[35fb4f80], approval_record=[d517bc62]
modal_deployment✅ completedevaluation_results=[StepRun], approved=[StepRun], model=[StepRun], preprocess_pipeline=[StepRun]deployment_info=[90fcc26f]
generate_sbom✅ completeddeployment_info=[StepRun]sbom_artifact=[797b4e73], sbom_html=[HTMLString]
generate_annex_iv_documentation🔄 runningevaluation_results=[StepRun], deployment_info=[StepRun], risk_scores=[StepRun]annex_iv_path=[pending], annex_iv_html=[HTMLString]
+
+ """ + + # If we have real execution history data, process it here + html = "
Pipeline Execution History
" + + for pipeline in execution_history: + pipeline_name = pipeline.get("name", "Unknown Pipeline") + run_id = pipeline.get("run_id", "Unknown") + steps = pipeline.get("steps", []) + + html += f""" +
+

{pipeline_name}

+

Run ID: {run_id}

+ + + + + + + + + + + """ + + for step in steps: + step_name = step.get("name", "Unknown") + status = step.get("status", "Unknown") + status_icon = ( + "✅" + if status == "completed" + else "🔄" + if status == "running" + else "❌" + ) + inputs = step.get("inputs", "-") + outputs = step.get("outputs", "-") + + html += f""" + + + + + + + """ + + html += """ + +
Step NameStatusInputsOutputs
{step_name}{status_icon} {status}{inputs}{outputs}
+
+ """ + + return html + + +def generate_stack_components_table(stack_components: Dict[str, Any]) -> str: + """Generate HTML table for stack components.""" + if not stack_components: + return "

No stack components available

" + + html = """ + + + + + + + + + + + """ + + for component_type, components in stack_components.items(): + if isinstance(components, list): + for component in components: + html += f""" + + + + + + + """ + + html += """ + +
Component TypeNameFlavorIntegration
{component_type.replace("_", " ").title()}{component.get("name", "Unknown")}{component.get("flavor", "Unknown")}{component.get("integration", "Built-in")}
+ """ + + return html + + +def generate_framework_versions_table(frameworks: Dict[str, str]) -> str: + """Generate HTML table for framework versions.""" + if not frameworks: + return "

No framework versions available

" + + html = """ + + + + + + + + + """ + + for framework, version in sorted(frameworks.items()): + html += f""" + + + + + """ + + html += """ + +
FrameworkVersion
{framework}{version}
+ """ + + return html + + +def generate_fairness_assessment_section(fairness_data: Dict[str, Any]) -> str: + """Generate comprehensive fairness assessment section using shared CSS.""" + if not fairness_data: + return "

No fairness assessment data available

" + + fairness_metrics = fairness_data.get("fairness_metrics", {}) + bias_flag = fairness_data.get("bias_flag", True) + + bias_status = ( + " Bias Detected" + if bias_flag + else " No Bias Detected" + ) + + html = f""" +
+

Fairness Assessment

+
+
Bias Detection:
+
{bias_status}
+
Protected Attributes:
+
{len(fairness_metrics)}
+
+ {generate_fairness_table(fairness_metrics)} +
+ """ + + return html + + +def generate_deployment_info_section(deployment_info: Dict[str, Any]) -> str: + """Generate deployment information section using shared CSS.""" + if not deployment_info: + return "" + + status_indicator = ( + " Active" + if deployment_info.get("deployed", False) + else " Pending" + ) + + return f""" +
+
+

9. Deployment Information

+
+
+
Deployment Status:
+
{status_indicator}
+
Environment:
+
{deployment_info.get("environment", "Unknown")}
+
API Endpoint:
+
{deployment_info.get("api_url", "Not Available")}
+
Deployment Time:
+
{deployment_info.get("deployment_time", "Unknown")}
+
+
+ """ + + +def generate_fairness_table(fairness_metrics: Dict[str, Any]) -> str: + """Generate HTML table for fairness metrics using shared CSS.""" + if not fairness_metrics: + return "

No fairness metrics available

" + + html = """ + + + + + + + + + + """ + + for attr, metrics in fairness_metrics.items(): + di_ratio = metrics.get("disparate_impact_ratio", 0) + status_indicator = ( + " Fair" + if di_ratio >= 0.8 + else " Biased" + ) + + html += f""" + + + + + + """ + + html += """ + +
Protected AttributeDisparate Impact RatioStatus
{attr.replace("_", " ").title()}{di_ratio:.3f}{status_indicator}
+ """ + + return html diff --git a/credit-scorer/src/steps/training/risk_assessment.py b/credit-scorer/src/steps/training/risk_assessment.py index b14063b3..67624821 100644 --- a/credit-scorer/src/steps/training/risk_assessment.py +++ b/credit-scorer/src/steps/training/risk_assessment.py @@ -17,15 +17,21 @@ from datetime import datetime from pathlib import Path -from typing import Annotated, Dict, List +from typing import Annotated, Dict, List, Tuple from openpyxl import Workbook, load_workbook from zenml import get_step_context, log_metadata, step from zenml.logger import get_logger +from zenml.types import HTMLString from src.constants import Artifacts as A from src.constants import Hazards from src.utils.storage import save_artifact_to_modal +from src.utils.visualizations.shared_styles import ( + get_badge_class, + get_html_template, + get_risk_class, +) logger = get_logger(__name__) @@ -136,12 +142,96 @@ def get_article_for_hazard(hazard_id: str) -> str: ) # Default to Risk Management +def generate_risk_visualization(risk_scores: Dict, run_id: str) -> HTMLString: + """Generate HTML visualization for risk assessment results using shared CSS.""" + overall_risk = risk_scores.get("overall", 0.0) + auc_risk = risk_scores.get("risk_auc", 0.0) + bias_risk = risk_scores.get("risk_bias", 0.0) + hazards = risk_scores.get("hazards", []) + + # Risk level categorization + if overall_risk < 0.3: + risk_level = "LOW" + elif overall_risk < 0.7: + risk_level = "MEDIUM" + else: + risk_level = "HIGH" + + # Generate the main content using shared CSS classes + content = f""" +
+

Risk Assessment Report

+

EU AI Act Article 9 Compliance

+

Run ID: {run_id}

+
+ +
+
+
+

Overall Risk

+
{overall_risk:.2f}
+
{risk_level}
+
+
+

Model Performance Risk

+
{auc_risk:.2f}
+
Based on AUC Score
+
+
+

Bias Risk

+
{bias_risk:.2f}
+
Fairness Assessment
+
+
+ +
+

📋 Identified Hazards

+ {generate_hazards_html(hazards) if hazards else '

✅ No Hazards Identified

The model meets all risk thresholds for this assessment.

'} +
+ +
+ Generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")} +
+
+ """ + + return HTMLString( + get_html_template(f"Risk Assessment Report - {run_id}", content) + ) + + +def generate_hazards_html(hazards: List[Dict]) -> str: + """Generate HTML for hazards list using shared CSS classes.""" + html = "" + for hazard in hazards: + severity = hazard.get("severity", "low").lower() + hazard_class = f"hazard-{severity}" + badge_class = get_badge_class(severity) + + html += f""" +
+
{hazard.get("id", "UNKNOWN")}
+
+ {severity.upper()} +
+
{hazard.get("description", "No description available")}
+
+ Mitigation: {hazard.get("mitigation", "No mitigation specified")} +
+
+ """ + + return html + + @step def risk_assessment( evaluation_results: Dict, approval_thresholds: Dict[str, float], risk_register_path: str = "docs/risk/risk_register.xlsx", -) -> Annotated[Dict, A.RISK_SCORES]: +) -> Tuple[ + Annotated[Dict, A.RISK_SCORES], Annotated[HTMLString, A.RISK_VISUALIZATION] +]: """Compute risk scores & update register. Article 9 compliant.""" scores = score_risk(evaluation_results) hazards = identify_hazards(evaluation_results, scores) @@ -288,4 +378,9 @@ def risk_assessment( "risk_register_path": str(risk_register_path), } log_metadata(metadata=result) - return result + + # Generate visualization + run_id = get_step_context().pipeline_run.id + risk_visualization = generate_risk_visualization(result, str(run_id)) + + return result, risk_visualization diff --git a/credit-scorer/src/steps/training/train.py b/credit-scorer/src/steps/training/train.py index 151b4a8b..ef72c475 100644 --- a/credit-scorer/src/steps/training/train.py +++ b/credit-scorer/src/steps/training/train.py @@ -15,6 +15,7 @@ # limitations under the License. # +import os from datetime import datetime from typing import Annotated, Dict, Optional, Tuple @@ -199,6 +200,7 @@ def train_model( model_metadata.update(fairness_metadata) # Save model locally & to Modal volume + os.makedirs(os.path.dirname(model_path), exist_ok=True) joblib.dump((model, model_metadata), model_path) model_checksum = save_artifact_to_modal( artifact=(model, model_metadata), diff --git a/credit-scorer/src/utils/visualizations/dashboard.py b/credit-scorer/src/utils/visualizations/dashboard.py index 8cfc043a..aa3f03ed 100644 --- a/credit-scorer/src/utils/visualizations/dashboard.py +++ b/credit-scorer/src/utils/visualizations/dashboard.py @@ -485,10 +485,16 @@ def generate_compliance_dashboard_html( if risk_df is not None: try: + # Handle case where risk_df is a dict (from load_risk_register) + if isinstance(risk_df, dict): + risk_df = risk_df.get('Risks', risk_df.get('risks', None)) + if risk_df is None: + raise ValueError("No 'Risks' or 'risks' sheet found in risk data") + severity_column = next( ( col - for col in ["risk_category", "risk_level"] + for col in ["risk_category", "risk_level", "Risk_category", "Risk_level"] if col in risk_df.columns ), None, @@ -672,6 +678,12 @@ def generate_compliance_dashboard_html( """ # Close findings section + # Add API Documentation Section + html += generate_api_documentation_section(compliance_results.get('deployment_info')) + + # Add Risk Management Section + html += generate_risk_management_section(risk_df) + # Add compliance status bar compliance_percentage = compliance_summary.get("overall_score", 0) last_release_id = compliance_summary.get("release_id", "Unknown") @@ -718,12 +730,284 @@ def generate_compliance_dashboard_html( return html +def generate_api_documentation_section(deployment_info: Optional[Dict[str, Any]] = None) -> str: + """Generate API documentation section for the compliance dashboard.""" + + # Extract actual deployment URL from deployment_info if available + modal_url = "https://api-endpoint.modal.run" # fallback + + if deployment_info: + # Extract URL from deployment_record structure + deployment_record = deployment_info.get("deployment_record", {}) + endpoints = deployment_record.get("endpoints", {}) + modal_url = endpoints.get("root", modal_url) + + # Clean up URL if needed + if modal_url and not modal_url.startswith("http"): + modal_url = f"https://{modal_url}" + + html = """ +
+

🚀 API Documentation & Integration

+

Credit Scoring API endpoints for system integration and monitoring (Article 17 - Post-market monitoring).

+ +
+ API Base URL: """ + modal_url + """ +
+ +

Available Endpoints

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
EndpointMethodPurposeEU AI Act Article
/healthGETHealth CheckArticle 17
/predictPOSTMake Credit PredictionsArticle 14 (Human Oversight)
/monitorGETData Drift MonitoringArticle 17
/incidentPOSTReport IssuesArticle 18 (Incident Reporting)
+ +
+
+

Sample Prediction Request

+
+{
+  "AMT_INCOME_TOTAL": 450000.0,
+  "AMT_CREDIT": 1000000.0,
+  "AMT_ANNUITY": 60000.0,
+  "CODE_GENDER": "M",
+  "NAME_EDUCATION_TYPE": "Higher education",
+  "DAYS_BIRTH": -10000,
+  "EXT_SOURCE_1": 0.75,
+  "EXT_SOURCE_2": 0.65,
+  "EXT_SOURCE_3": 0.85
+}
+
+ +
+

Sample Response

+
+{
+  "probabilities": [0.75],
+  "model_version": "a1b2c3d4",
+  "timestamp": "2024-03-20T10:00:00Z",
+  "risk_assessment": {
+    "risk_score": 0.75,
+    "risk_level": "high"
+  }
+}
+
+
+ +
+ 🔒 Compliance Note: All API endpoints implement logging and monitoring requirements per EU AI Act Articles 12 (Record Keeping) and 17 (Post-market monitoring). + Prediction requests include model version tracking and risk assessment transparency per Article 14 (Human Oversight). +
+
+ """ + + return html + + +def generate_risk_management_section(risk_df) -> str: + """Generate risk management section for the compliance dashboard.""" + + html = """ +
+

🛡️ Risk Management Dashboard

+

Comprehensive risk monitoring and management system implementing EU AI Act Article 9 requirements.

+ """ + + if risk_df is not None: + try: + # Handle case where risk_df is a dict (from load_risk_register) + if isinstance(risk_df, dict): + risk_df = risk_df.get('Risks', risk_df.get('risks', None)) + if risk_df is None: + raise ValueError("No 'Risks' or 'risks' sheet found in risk data") + + if not risk_df.empty: + # Determine severity column (handle both uppercase and lowercase variants) + severity_column = next( + (col for col in ["risk_category", "risk_level", "Risk_category", "Risk_level"] if col in risk_df.columns), + None, + ) + + if severity_column: + # Calculate risk statistics + total_risks = len(risk_df) + risk_counts = risk_df[severity_column].value_counts() + high_risks = risk_counts.get("HIGH", 0) + medium_risks = risk_counts.get("MEDIUM", 0) + low_risks = risk_counts.get("LOW", 0) + + # Calculate completion rate + completion_rate = 0 + if "status" in risk_df.columns: + completed = (risk_df["status"] == "COMPLETED").sum() + completion_rate = (completed / total_risks * 100) if total_risks > 0 else 0 + + html += f""" +
+
+
{high_risks}
+
HIGH RISK
+
+
+
{medium_risks}
+
MEDIUM RISK
+
+
+
{low_risks}
+
LOW RISK
+
+
+
{completion_rate:.0f}%
+
MITIGATION PROGRESS
+
+
+ +

Risk Register Summary

+
+ + + + + + + + + + + + """ + + # Add risk rows (limit to top 20 for performance) + for idx, (_, row) in enumerate(risk_df.head(20).iterrows()): + risk_id = row.get("id", f"RISK-{idx+1}") + description = row.get("risk_description", "Risk description") + level = row.get(severity_column, "UNKNOWN") + category = row.get("category", "General") + status = row.get("status", "PENDING") + + # Color code the risk level + if level == "HIGH": + level_color = "#D64045" + level_bg = "#fff2f2" + elif level == "MEDIUM": + level_color = "#FFB30F" + level_bg = "#fff9e6" + elif level == "LOW": + level_color = "#478C5C" + level_bg = "#f0f7f0" + else: + level_color = "#666" + level_bg = "#f8f9fa" + + html += f""" + + + + + + + + """ + + html += """ + +
Risk IDDescriptionLevelCategoryStatus
{risk_id}{description[:100]}{'...' if len(str(description)) > 100 else ''} + + {level} + + {category} + {status} +
+
+ """ + + # Add risk categories breakdown + if "category" in risk_df.columns: + category_counts = risk_df["category"].value_counts() + html += """ +

Risk Categories

+
+ """ + + for category, count in category_counts.head(6).items(): + html += f""" +
+
{count}
+
{category}
+
+ """ + + html += "
" + + else: + html += """ +
+ Notice: Risk level information not found in the current risk register. +
+ """ + + except Exception as e: + html += f""" +
+ Warning: Error processing risk data: {str(e)} +
+ """ + + else: + html += """ +
+ Info: No risk register data available. Risk management data will be populated when the risk assessment pipeline runs. +
+ """ + + html += """ +
+ 📋 Article 9 Compliance: This risk management system implements comprehensive risk identification, assessment, mitigation, and monitoring processes. + All risks are systematically tracked with defined mitigation strategies and regular review cycles to ensure ongoing EU AI Act compliance. +
+
+ """ + + return html + + def create_compliance_dashboard_artifact( compliance_results: Dict[str, Any], risk_df: Optional[Any] = None, incident_df: Optional[Any] = None, ) -> HTMLString: - """Create a ZenML HTML artifact for the compliance dashboard. + """Create a ZenML HTML artifact for the compliance dashboard using shared CSS. Args: compliance_results: Dictionary with compliance calculation results diff --git a/credit-scorer/src/utils/visualizations/eval.py b/credit-scorer/src/utils/visualizations/eval.py index e0493cb7..460cb2f3 100644 --- a/credit-scorer/src/utils/visualizations/eval.py +++ b/credit-scorer/src/utils/visualizations/eval.py @@ -5,6 +5,8 @@ from zenml.logger import get_logger from zenml.types import HTMLString +from .shared_styles import get_html_template + logger = get_logger(__name__) @@ -143,341 +145,202 @@ def _build_html_dashboard( Returns: str: Complete HTML content """ - # CSS and JavaScript have been shortened for brevity - html_content = f""" - - - - - - -

Credit Scoring Model Evaluation Dashboard

- -

Performance Overview

-
-
-

Accuracy

-
{metrics["accuracy"]:.2%}
-

Overall classification accuracy

-
-
-

AUC-ROC

-
{metrics["auc"]:.2%}
-

Area under ROC curve

-
-
-

F1 Score

-
{metrics["f1"]:.2%}
-

Harmonic mean of precision and recall

-
-
-

Optimal F1

-
{metrics["opt_f1"]:.2%}
-

At threshold {metrics["min_cost_threshold"]}

-
-
- -
-
- - - -
- -
-

Precision-Recall Curve

-
- Precision-Recall Curve -
- -

Threshold Comparison

-

Different threshold values and their impact on model performance metrics:

- - - - - - - - - """ - - # Add rows for each threshold + # Build threshold table rows + threshold_rows = "" for threshold in sorted(threshold_metrics.keys()): th_metrics = threshold_metrics[threshold] - row_class = ( - "optimal-row" if threshold == metrics["min_cost_threshold"] else "" - ) - html_content += f""" + row_class = "optimal-row" if threshold == metrics["min_cost_threshold"] else "" + threshold_rows += f""" - - """ - - html_content += """ -
ThresholdPrecisionRecallF1 ScoreCost
{threshold} {th_metrics.get("precision", 0):.4f} {th_metrics.get("recall", 0):.4f} {th_metrics.get("f1_score", 0):.4f} {th_metrics.get("normalized_cost", 0):.4f}
-
- -
-

Standard Metrics (at threshold 0.5)

- - - - - - - """ + """ - # Add standard metrics + # Build standard metrics table rows metrics_descriptions = [ - ( - "Accuracy", - f"{metrics['accuracy']:.4f}", - "Proportion of correctly classified instances", - ), - ( - "Precision", - f"{metrics['precision']:.4f}", - "True positives / (True positives + False positives)", - ), - ( - "Recall", - f"{metrics['recall']:.4f}", - "True positives / (True positives + False negatives)", - ), - ( - "F1 Score", - f"{metrics['f1']:.4f}", - "Harmonic mean of precision and recall", - ), - ( - "AUC-ROC", - f"{metrics['auc']:.4f}", - "Area under the Receiver Operating Characteristic curve", - ), - ( - "Average Precision", - f"{metrics['avg_precision']:.4f}", - "Average precision score across all recall levels", - ), + ("Accuracy", f"{metrics['accuracy']:.4f}", "Proportion of correctly classified instances"), + ("Precision", f"{metrics['precision']:.4f}", "True positives / (True positives + False positives)"), + ("Recall", f"{metrics['recall']:.4f}", "True positives / (True positives + False negatives)"), + ("F1 Score", f"{metrics['f1']:.4f}", "Harmonic mean of precision and recall"), + ("AUC-ROC", f"{metrics['auc']:.4f}", "Area under the Receiver Operating Characteristic curve"), + ("Average Precision", f"{metrics['avg_precision']:.4f}", "Average precision score across all recall levels"), ] - + + standard_metrics_rows = "" for metric, value, description in metrics_descriptions: - html_content += f""" + standard_metrics_rows += f""" - - """ + """ - html_content += """ -
MetricValueDescription
{metric} {value} {description}
- -

Optimal Metrics

- - - - - - - """ - - # Add optimal metrics + # Build optimal metrics table rows optimal_metrics = [ - ( - "Optimal Precision", - f"{metrics['opt_precision']:.4f}", - f"{metrics['min_cost_threshold']}", - ), - ( - "Optimal Recall", - f"{metrics['opt_recall']:.4f}", - f"{metrics['min_cost_threshold']}", - ), - ( - "Optimal F1 Score", - f"{metrics['opt_f1']:.4f}", - f"{metrics['min_cost_threshold']}", - ), - ( - "Optimal Cost", - f"{metrics['opt_cost']:.4f}", - f"{metrics['min_cost_threshold']}", - ), + ("Optimal Precision", f"{metrics['opt_precision']:.4f}", f"{metrics['min_cost_threshold']}"), + ("Optimal Recall", f"{metrics['opt_recall']:.4f}", f"{metrics['min_cost_threshold']}"), + ("Optimal F1 Score", f"{metrics['opt_f1']:.4f}", f"{metrics['min_cost_threshold']}"), + ("Optimal Cost", f"{metrics['opt_cost']:.4f}", f"{metrics['min_cost_threshold']}"), ] - + + optimal_metrics_rows = "" for metric, value, threshold in optimal_metrics: - html_content += f""" + optimal_metrics_rows += f""" - - """ + """ - # Add confusion matrix and insights - html_content += f""" -
MetricValueThreshold
{metric} {value} {threshold}
+ # Generate main content using shared CSS classes + content = f""" +
+

Model Evaluation Dashboard

+

EU AI Act Article 15 Compliance - Accuracy & Robustness

+
+ +
+

Key Performance Metrics

+
+
+
{metrics['accuracy']:.2%}
+
Accuracy
+
+
+
{metrics['auc']:.3f}
+
AUC-ROC
+
+
+
{metrics['opt_f1']:.3f}
+
Optimal F1 Score
+
+
+
{metrics['min_cost_threshold']}
+
Optimal Threshold
+
-
-

Confusion Matrix

-

Visual representation of model predictions vs actual values:

-
- - - - - - - - - - - - - - - - +

Precision-Recall Curve

+
+ Precision-Recall Curve +
+ +
+
+ + + +
+ +
+

Threshold Analysis

+

Different threshold values and their impact on model performance metrics:

+
Predicted: No Default (0)Predicted: Default (1)
Actual: No Default (0){metrics["tn"]} (True Negatives){metrics["fp"]} (False Positives)
Actual: Default (1){metrics["fn"]} (False Negatives){metrics["tp"]} (True Positives)
+ + + + + + + + + + + {threshold_rows} + +
ThresholdPrecisionRecallF1 ScoreCost
+
+ +
+

Standard Metrics (at threshold 0.5)

+ + + + + + + + + + {standard_metrics_rows} + +
MetricValueDescription
+ +

Optimal Metrics

+ + + + + + + + + + {optimal_metrics_rows} +
MetricValueThreshold
-

Interpretation

+
+

Confusion Matrix

+

Visual representation of model predictions vs actual values:

+
+ + + + + + + + + + + + + + + + + + + + +
Predicted: No Default (0)Predicted: Default (1)
Actual: No Default (0){metrics["tn"]} (True Negatives){metrics["fp"]} (False Positives)
Actual: Default (1){metrics["fn"]} (False Negatives){metrics["tp"]} (True Positives)
+
+ +

Interpretation

+
    +
  • True Negatives ({metrics["tn"]}): Correctly identified non-defaults
  • +
  • False Positives ({metrics["fp"]}): Incorrectly flagged as defaults
  • +
  • False Negatives ({metrics["fn"]}): Defaults missed by the model
  • +
  • True Positives ({metrics["tp"]}): Correctly identified defaults
  • +
+ +
+ Note: In credit scoring, False Negatives (missed defaults) are typically more costly than False Positives (wrongly declined creditworthy customers). +
+
+
+ +
+
+

Insights and Recommendations

+
    -
  • True Negatives ({metrics["tn"]}): Correctly identified non-defaults
  • -
  • False Positives ({metrics["fp"]}): Incorrectly flagged as defaults
  • -
  • False Negatives ({metrics["fn"]}): Defaults missed by the model
  • -
  • True Positives ({metrics["tp"]}): Correctly identified defaults
  • +
  • The model achieves an AUC of {metrics["auc"]:.2%}, indicating good discriminative ability.
  • +
  • The optimal threshold for minimizing cost is {metrics["min_cost_threshold"]}, yielding a cost of {metrics["opt_cost"]:.4f}.
  • +
  • At this threshold, precision is {metrics["opt_precision"]:.2%} and recall is {metrics["opt_recall"]:.2%}.
  • +
  • The model correctly identifies {metrics["opt_recall"]:.2%} of actual defaults (Recall) while maintaining {metrics["opt_precision"]:.2%} precision.
- -

Note: In credit scoring, False Negatives (missed defaults) are typically more costly than False Positives (wrongly declined creditworthy customers).

- -

Insights and Recommendations

-
    -
  • The model achieves an AUC of {metrics["auc"]:.2%}, indicating good discriminative ability.
  • -
  • The optimal threshold for minimizing cost is {metrics["min_cost_threshold"]}, yielding a cost of {metrics["opt_cost"]:.4f}.
  • -
  • At this threshold, precision is {metrics["opt_precision"]:.2%} and recall is {metrics["opt_recall"]:.2%}.
  • -
  • The model correctly identifies {metrics["opt_recall"]:.2%} of actual defaults (Recall) while maintaining {metrics["opt_precision"]:.2%} precision.
  • -
- - """ - return html_content + return get_html_template("Model Evaluation Dashboard", content, include_js=True) + def generate_eval_visualization(