zenml-io · htahir1 · May 16, 2025 · May 15, 2025 · May 16, 2025 · May 16, 2025
diff --git a/.typos.toml b/.typos.toml
@@ -56,6 +56,7 @@ mape = "mape"
 yhat = "yhat"
 yhat_lower = "yhat_lower"
 yhat_upper = "yhat_upper"
+fpr = "fpr"
 
 [default]
 locale = "en-us"
diff --git a/bank_subscription_prediction/Dockerfile.codespace b/bank_subscription_prediction/Dockerfile.codespace
@@ -0,0 +1,38 @@
+# Sandbox base image (NOTE(review): `latest` here and on the uv image floats; pin a tag or digest for reproducible builds)
+FROM zenmldocker/zenml-sandbox:latest
+
+# Install the uv and uvx binaries from the official uv image
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+# uv settings: target the system Python and compile bytecode at install time
+ENV UV_SYSTEM_PYTHON=1 \
+    UV_COMPILE_BYTECODE=1
+
+# Project metadata
+LABEL project_name="bank_subscription_prediction" \
+      project_version="0.1.0"
+
+# Install dependencies with uv; the cache mount keeps uv's download cache out of the image layer
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system \
+    "jupyter" \
+    "matplotlib" \
+    "notebook" \
+    "pandas" \
+    "plotly" \
+    "pyarrow" \
+    "scikit-learn" \
+    "xgboost" \
+    "zenml[server]>=0.50.0"
+
+# Set workspace directory
+WORKDIR /workspace
+
+# Shallow-clone the repo, copy this project's files, drop the clone; "dir/." (unlike "dir/*") also copies dotfiles
+RUN git clone --depth 1 https://github.com/zenml-io/zenml-projects.git /tmp/zenml-projects && \
+    cp -r /tmp/zenml-projects/bank_subscription_prediction/. /workspace/ && \
+    rm -rf /tmp/zenml-projects
+
+# VSCode settings: default to the dark theme (file ends with a newline)
+RUN mkdir -p /workspace/.vscode && \
+    printf '{\n  "workbench.colorTheme": "Default Dark Modern"\n}\n' > /workspace/.vscode/settings.json
diff --git a/bank_subscription_prediction/README.md b/bank_subscription_prediction/README.md
@@ -0,0 +1,187 @@
+# 🏦 Bank Subscription Prediction
+
+A production-ready MLOps pipeline for predicting bank term deposit subscriptions using XGBoost.
+
+<div align="center">
+  <br/>
+    <img alt="Training Pipeline DAG" src="assets/training_dag.png" width="70%">
+  <br/>
+  <p><em>ZenML visualization of the training pipeline DAG</em></p>
+</div>
+
+## 🎯 Business Context
+
+In banking, accurate prediction of which customers are likely to subscribe to term deposits helps optimize marketing campaigns and increase conversion rates. This project provides a production-ready prediction solution that:
+
+- Predicts the likelihood of customers subscribing to term deposits
+- Handles class imbalance common in marketing datasets
+- Implements feature selection to identify key factors influencing subscriptions
+- Provides interactive visualizations of model performance
+
+## 📊 Data Overview
+
+This project uses the [Bank Marketing dataset](https://archive.ics.uci.edu/ml/datasets/bank+marketing) from the UCI Machine Learning Repository. The dataset contains:
+
+- Customer demographic information (age, job, marital status, education)
+- Financial attributes (housing, loan, balance)
+- Campaign details (contact channel, day, month, duration)
+- Previous campaign outcomes
+- Target variable: whether the client subscribed to a term deposit (yes/no)
+
+The data loader will automatically download and cache the dataset if it's not available locally. No need to manually download the data!
+
+## 🚀 Pipeline Architecture
+
+The project implements a complete ML pipeline with the following steps:
+
+1. **Data Loading**: Auto-download or load the bank marketing dataset
+2. **Data Cleaning**: Handle missing values and outliers
+3. **Data Preprocessing**: Process categorical variables, drop unnecessary columns
+4. **Data Splitting**: Split data into training and test sets
+5. **Model Training**: Train an XGBoost classifier with selected features
+6. **Model Evaluation**: Evaluate model performance and present the results as an interactive HTML visualization
+
+## 💡 Model Details
+
+This solution uses XGBoost, chosen for its strengths in:
+
+- **Class Imbalance**: Targets the common problem in marketing datasets where positive responses are rare
+- **Feature Importance**: Automatically identifies and ranks the most influential factors
+- **Scalability**: Efficiently processes large customer datasets
+- **Performance**: Consistently outperforms traditional classifiers for this type of prediction task
+
+## 🛠️ Getting Started
+
+### Prerequisites
+
+- Python 3.9+
+- ZenML installed and configured
+
+### Installation
+
+```bash
+# Clone the repository
+git clone https://github.com/zenml-io/zenml-projects.git
+cd zenml-projects/bank_subscription_prediction
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Initialize ZenML (if needed)
+zenml init
+```
+
+### Running the Pipeline
+
+#### Basic Usage
+
+```bash
+python run.py
+```
+
+#### Using Different Configurations
+
+```bash
+python run.py --config configs/more_trees.yaml
+```
+
+### Available Configurations
+
+| Config File | Description | Key Parameters |
+|-------------|-------------|----------------|
+| `baseline.yaml` | Default XGBoost parameters | 100 estimators, max depth of 3 |
+| `more_trees.yaml` | Increased number of estimators | 200 estimators |
+| `deeper_trees.yaml` | Increased maximum tree depth | Max depth of 5 |
+
+## 📁 Project Structure
+
+```
+bank_subscription_prediction/
+├── configs/             # YAML Configuration files
+│   ├── __init__.py
+│   ├── baseline.yaml    # Baseline experiment config
+│   ├── more_trees.yaml  # Config with more trees
+│   └── deeper_trees.yaml # Config with deeper trees
+├── pipelines/           # ZenML pipeline definitions
+│   ├── __init__.py
+│   └── training_pipeline.py
+├── steps/               # ZenML pipeline steps
+│   ├── __init__.py
+│   ├── data_loader.py
+│   ├── data_cleaner.py
+│   ├── data_preprocessor.py
+│   ├── data_splitter.py
+│   ├── model_trainer.py
+│   └── model_evaluator.py
+├── utils/               # Utility functions and helpers
+│   ├── __init__.py
+│   └── model_utils.py
+├── __init__.py
+├── requirements.txt     # Project dependencies
+├── README.md            # Project documentation
+└── run.py               # Main script to run the pipeline
+```
+
+## 🔧 Creating Custom Configurations
+
+You can create new YAML configuration files by copying and modifying existing ones:
+
+```yaml
+# my_custom_config.yaml
+# Start with copying an existing config and modify the values
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+      - pandas
+      - numpy
+    requirements:
+      - matplotlib
+      - xgboost
+      - plotly
+      - click
+      - pyarrow
+
+# Model Control Plane config
+model:
+  name: bank_subscription_classifier
+  version: 0.1.0
+  license: MIT
+  description: A bank term deposit subscription classifier
+  tags: ["bank_marketing", "classifier", "xgboost"]
+
+# Custom step parameters
+steps:
+  # ...other step params...
+  train_xgb_model_with_feature_selection:
+    n_estimators: 300
+    max_depth: 4
+    # ...other parameters...
+```
+
+## 📈 Example Use Case: Marketing Campaign Optimization
+
+A retail bank uses this pipeline to:
+
+1. Train models on historical marketing campaign data
+2. Identify key customer segments most likely to convert
+3. Deploy targeted campaigns to high-probability customers
+4. Achieve 35% higher conversion rates with 25% lower campaign costs
+
+## 🔄 Integration with Banking Systems
+
+This solution can be integrated with existing banking systems:
+
+- **CRM Systems**: Feed predictions into customer relationship management systems
+- **Marketing Automation**: Provide segments for targeted campaign execution
+- **BI Dashboards**: Export prediction insights to business intelligence tools
+- **Customer Service**: Prioritize high-value potential customers for follow-up
+
+## 👏 Credits
+
+This project is based on the Jupyter notebook [predict_bank_cd_subs_by_xgboost_clf_for_imbalance_dataset.ipynb](https://github.com/IBM/xgboost-financial-predictions/blob/master/notebooks/predict_bank_cd_subs_by_xgboost_clf_for_imbalance_dataset.ipynb) from IBM's xgboost-financial-predictions repository. The original work demonstrates XGBoost classification for imbalanced datasets and has been adapted into a complete ZenML pipeline.
+
+## 📄 License
+
+This project is licensed under the Apache License 2.0. 
diff --git a/bank_subscription_prediction/__init__.py b/bank_subscription_prediction/__init__.py
@@ -0,0 +1 @@
+"""Bank Subscription Prediction Project using ZenML.""" 
diff --git a/bank_subscription_prediction/assets/training_dag.png b/bank_subscription_prediction/assets/training_dag.png
diff --git a/bank_subscription_prediction/configs/baseline.yaml b/bank_subscription_prediction/configs/baseline.yaml
@@ -0,0 +1,49 @@
+# Baseline experiment configuration (reference hyperparameters)
+
+# Docker settings: required integrations and additional requirements
+settings:
+  docker:
+    required_integrations: [sklearn, pandas, numpy]
+    requirements: [matplotlib, xgboost, plotly, click, pyarrow]
+
+# Model Control Plane configuration
+# NOTE(review): license below is MIT, but the project README states
+# Apache License 2.0 — confirm which one is intended.
+model:
+  name: bank_subscription_classifier
+  version: 0.1.0
+  license: MIT
+  description: A bank term deposit subscription classifier
+  tags:
+    - "bank_marketing"
+    - "classifier"
+    - "xgboost"
+
+# Parameters grouped by step name
+steps:
+  # load_data: CSV file to read
+  load_data:
+    csv_file_path: "bank.csv"
+
+  # split_data_step: test size, random seed, and stratification column
+  split_data_step:
+    test_size: 0.2
+    random_state: 42
+    stratify_col: "y"
+
+  # train_xgb_model_with_feature_selection: XGBoost hyperparameters
+  train_xgb_model_with_feature_selection:
+    learning_rate: 0.1
+    n_estimators: 100
+    max_depth: 3
+    min_child_weight: 1
+    gamma: 0
+    subsample: 0.8
+    colsample_bytree: 0.8
+    objective: "binary:logistic"
+    # Original author's note: calculated dynamically if not overridden.
+    # NOTE(review): confirm that an explicit 1 does not count as an override.
+    scale_pos_weight: 1
+    random_state: 42
+    # Threshold used for feature selection
+    feature_selection_threshold: "median"
diff --git a/bank_subscription_prediction/configs/deeper_trees.yaml b/bank_subscription_prediction/configs/deeper_trees.yaml
@@ -0,0 +1,49 @@
+# Deeper trees experiment configuration
+# Differs from baseline.yaml only in max_depth (5 instead of 3).
+
+# Docker settings: required integrations and additional requirements
+settings:
+  docker:
+    required_integrations: [sklearn, pandas, numpy]
+    requirements: [matplotlib, xgboost, plotly, click, pyarrow]
+
+# Model Control Plane configuration
+# NOTE(review): license below is MIT, but the project README states
+# Apache License 2.0 — confirm which one is intended.
+model:
+  name: bank_subscription_classifier
+  version: 0.1.0
+  license: MIT
+  description: A bank term deposit subscription classifier
+  tags:
+    - "bank_marketing"
+    - "classifier"
+    - "xgboost"
+
+# Parameters grouped by step name
+steps:
+  # load_data: CSV file to read
+  load_data:
+    csv_file_path: "bank.csv"
+
+  # split_data_step: test size, random seed, and stratification column
+  split_data_step:
+    test_size: 0.2
+    random_state: 42
+    stratify_col: "y"
+
+  # train_xgb_model_with_feature_selection: XGBoost hyperparameters
+  train_xgb_model_with_feature_selection:
+    learning_rate: 0.1
+    n_estimators: 100
+    max_depth: 5  # Deeper trees than baseline
+    min_child_weight: 1
+    gamma: 0
+    subsample: 0.8
+    colsample_bytree: 0.8
+    objective: "binary:logistic"
+    # NOTE(review): pinned to 1; confirm the training step still adjusts for class imbalance.
+    scale_pos_weight: 1
+    random_state: 42
+    # Threshold used for feature selection
+    feature_selection_threshold: "median"
diff --git a/bank_subscription_prediction/configs/more_trees.yaml b/bank_subscription_prediction/configs/more_trees.yaml
@@ -0,0 +1,49 @@
+# More trees experiment configuration
+# Differs from baseline.yaml only in n_estimators (200 instead of 100).
+
+# Docker settings: required integrations and additional requirements
+settings:
+  docker:
+    required_integrations: [sklearn, pandas, numpy]
+    requirements: [matplotlib, xgboost, plotly, click, pyarrow]
+
+# Model Control Plane configuration
+# NOTE(review): license below is MIT, but the project README states
+# Apache License 2.0 — confirm which one is intended.
+model:
+  name: bank_subscription_classifier
+  version: 0.1.0
+  license: MIT
+  description: A bank term deposit subscription classifier
+  tags:
+    - "bank_marketing"
+    - "classifier"
+    - "xgboost"
+
+# Parameters grouped by step name
+steps:
+  # load_data: CSV file to read
+  load_data:
+    csv_file_path: "bank.csv"
+
+  # split_data_step: test size, random seed, and stratification column
+  split_data_step:
+    test_size: 0.2
+    random_state: 42
+    stratify_col: "y"
+
+  # train_xgb_model_with_feature_selection: XGBoost hyperparameters
+  train_xgb_model_with_feature_selection:
+    learning_rate: 0.1
+    n_estimators: 200  # More trees than baseline
+    max_depth: 3
+    min_child_weight: 1
+    gamma: 0
+    subsample: 0.8
+    colsample_bytree: 0.8
+    objective: "binary:logistic"
+    # NOTE(review): pinned to 1; confirm the training step still adjusts for class imbalance.
+    scale_pos_weight: 1
+    random_state: 42
+    # Threshold used for feature selection
+    feature_selection_threshold: "median"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Bank Subscription Prediction Project using ZenML."""