diff --git a/airflow-cloud-composer-etl-feature-train/steps/training/model_trainer.py b/airflow-cloud-composer-etl-feature-train/steps/training/model_trainer.py
index c8c04bd8..d34b6eac 100644
--- a/airflow-cloud-composer-etl-feature-train/steps/training/model_trainer.py
+++ b/airflow-cloud-composer-etl-feature-train/steps/training/model_trainer.py
@@ -21,6 +21,7 @@
 from materializers import BigQueryDataset, CSVDataset
 from typing_extensions import Annotated
 from zenml import ArtifactConfig, step
+from zenml.enums import ArtifactType
 from zenml.logger import get_logger
 
 logger = get_logger(__name__)
@@ -31,7 +32,7 @@ def train_xgboost_model(
     dataset: Union[BigQueryDataset, CSVDataset],
 ) -> Tuple[
     Annotated[
-        xgb.Booster, ArtifactConfig(name="xgb_model", is_model_artifact=True)
+        xgb.Booster, ArtifactConfig(name="xgb_model", artifact_type=ArtifactType.MODEL)
     ],
     Annotated[Dict[str, float], "metrics"],
 ]:
diff --git a/customer-satisfaction/steps/train_model.py b/customer-satisfaction/steps/train_model.py
index 98f7b4c0..c5a7a884 100644
--- a/customer-satisfaction/steps/train_model.py
+++ b/customer-satisfaction/steps/train_model.py
@@ -6,6 +6,7 @@
 from model.model_dev import ModelTrainer
 from sklearn.base import RegressorMixin
 from zenml import ArtifactConfig, step
+from zenml.enums import ArtifactType
 from zenml.client import Client
 
 experiment_tracker = Client().active_stack.experiment_tracker
@@ -21,7 +22,7 @@ def train_model(
     do_fine_tuning: bool = True,
 ) -> Annotated[
     RegressorMixin,
-    ArtifactConfig(name="sklearn_regressor", is_model_artifact=True),
+    ArtifactConfig(name="sklearn_regressor", artifact_type=ArtifactType.MODEL),
 ]:
     """
     Args:
diff --git a/databricks-demo/README.md b/databricks-demo/README.md
deleted file mode 100644
index d490ae21..00000000
--- a/databricks-demo/README.md
+++ /dev/null
@@ -1,156 +0,0 @@
-# ZenML E2E project
-
-This is a comprehensive supervised ML project built with the
-ZenML framework and its integrations. The project trains one or more
-scikit-learn classification models to make predictions on the tabular
-classification datasets provided by the scikit-learn library. The project was
-generated from the [E2E Batch ZenML project template](https://github.com/zenml-io/template-e2e-batch)
-with the following properties:
-- Project name: ZenML E2E project
-- Technical Name: production_line_quality_assurance
-- Version: `0.0.1`
-- Licensed under the Apache License 2.0 to ZenML GmbH
-- Deployment environment: `staging`
-
-The settings of your project are:
-- Hyperparameter and model architecture tuning using configuration from `config.py`
-- Trained model promotion to `staging` based on the accuracy metric vs. the currently deployed model
-- Data drift checks based on an Evidently report
-- Notifications about failures enabled
-
-> [!NOTE]
-> If you are coming from [our production guide](https://docs.zenml.io/user-guide/production-guide),
-> you can apply the same principles that you have seen in the guide to this project.
-> You can [connect it to remote storage](https://docs.zenml.io/user-guide/production-guide/remote-storage),
-> [run it with a cloud orchestrator](https://docs.zenml.io/user-guide/production-guide/cloud-orchestration),
-> [attach a git repository to it](https://docs.zenml.io/user-guide/production-guide/connect-code-repository),
-> and much more. If you are looking to try these steps with a simpler example,
-> feel free to take a look at [our starter template](https://github.com/zenml-io/template-starter)
-> as well.
-
-## 👋 Introduction
-
-Welcome to your newly generated "ZenML E2E project" project! This is
-a great way to get hands-on with ZenML using a production-like template.
-The project contains a collection of standard and custom ZenML steps,
-pipelines, and other artifacts and useful resources that can serve as a
-solid starting point for your smooth journey with ZenML.
-
-What to do first? You can start by giving the project a quick run. The
-project is ready to be used and can run as-is without any further code
-changes! You can try it right away by installing ZenML and the needed
-ZenML integrations and then calling the CLI included in the project. We also
-recommend that you start the ZenML UI locally to get a better sense of what
-is going on under the hood:
-
-```bash
-# Set up a Python virtual environment, if you haven't already
-python3 -m venv .venv
-source .venv/bin/activate
-# Install requirements & integrations
-make setup
-# Optionally, provision default local stack
-make install-stack-local
-# Start the ZenML UI locally (recommended, but optional)
-zenml login --local
-# Run the pipeline included in the project
-python run.py
-```
-
-When the pipelines are done running, you can check out the results in the ZenML
-UI by following the link printed in the terminal (or you can go straight to
-the [ZenML UI pipelines run page](http://127.0.0.1:8237/workspaces/default/all-runs?page=1)).
-
-Next, you should:
-
-* look at the CLI help to see what you can do with the project:
-```bash
-python run.py --help
-```
-* go back and [try out different parameters](https://github.com/zenml-io/template-e2e-batch#-template-parameters)
-for your generated project. For example, you could disable hyperparameter
-tuning and use your favorite model architecture or promote every trained model,
-if you haven't already!
-* take a look at [the project structure](#-project-structure) and the code
-itself. The code is heavily commented and should be easy to follow.
-* read the [ZenML documentation](https://docs.zenml.io) to learn more about
-various ZenML concepts referenced in the code and to get a better sense of
-what you can do with ZenML.
-* start building your own ZenML project by modifying this code.
-
-## 📦 What's in the box?
-
-The ZenML E2E project demonstrates how the most important steps of
-the ML Production Lifecycle can be implemented in a reusable way while
-remaining agnostic to the underlying infrastructure, and how to integrate
-them together into pipelines serving Training and Batch Inference purposes.
-
-This template uses
-[the Breast Cancer Dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html)
-to demonstrate how to perform the critical steps for Continuous Training (CT)
-and Continuous Delivery (CD).
-
-It consists of three pipelines with the following high-level setup:

-![Pipelines composition](.assets/00_pipelines_composition.png)

-
-All pipelines leverage the Model Control Plane to bring all parts together: the training pipeline creates and promotes a new Model Control Plane version with a trained model object in it, the deployment pipeline uses the inference Model Control Plane version (the one promoted during training) to create a deployment service, and the inference pipeline uses the deployment service from the inference Model Control Plane version and stores a new set of predictions back as a versioned data artifact for future use. This makes these pipelines closely connected, while ensuring that only quality-assured Model Control Plane versions are used to produce predictions delivered to stakeholders.
-* [CT] Training
-  * Load, split, and preprocess the training dataset
-  * Search for an optimal model object architecture and tune its hyperparameters
-  * Train the model object and evaluate its performance on the holdout set
-  * Compare a recently trained model object with one promoted earlier
-  * If a recently trained model object performs better - stage it as a new inference model object in the model registry
-  * On success of the current model object - stage the newly created Model Control Plane version as the one used for inference
-* [CD] Deployment
-  * Deploy a new prediction service based on the model object connected to the inference Model Control Plane version.
-* [CD] Batch Inference
-  * Load the inference dataset and preprocess it, reusing objects fitted during training
-  * Perform data drift analysis, reusing the training dataset of the inference Model Control Plane version as a reference
-  * Run predictions using a model object from the inference Model Control Plane version
-  * Store predictions as a versioned artifact and link it to the inference Model Control Plane version
-
-In [the repository documentation](https://github.com/zenml-io/template-e2e-batch#-how-this-template-is-implemented),
-you can find more details about every step of this template.
-
-The project code is meant to be used as a template for your projects. For
-this reason, you will find several places in the code specifically marked
-to indicate where you can add your code:
-
-```python
-### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
-...
-### YOUR CODE ENDS HERE ###
-```
-
-## 📜 Project Structure
-
-The project loosely follows [the recommended ZenML project structure](https://docs.zenml.io/how-to/setting-up-a-project-repository/best-practices):
-
-```
-.
-├── configs # pipelines configuration files -│ ├── deployer_config.yaml # the configuration of the deployment pipeline -│ ├── inference_config.yaml # the configuration of the batch inference pipeline -│ └── train_config.yaml # the configuration of the training pipeline -├── pipelines # `zenml.pipeline` implementations -│ ├── batch_inference.py # [CD] Batch Inference pipeline -│ ├── deployment.py # [CD] Deployment pipeline -│ └── training.py # [CT] Training Pipeline -├── steps # logically grouped `zenml.steps` implementations -│ ├── alerts # alert developer on pipeline status -│ ├── deployment # deploy trained model objects -│ ├── data_quality # quality gates built on top of drift report -│ ├── etl # ETL logic for dataset -│ ├── hp_tuning # tune hyperparameters and model architectures -│ ├── inference # inference on top of the model from the registry -│ ├── promotion # find if a newly trained model will be new inference -│ └── training # train and evaluate model -├── utils # helper functions -├── .dockerignore -├── Makefile # helper scripts for quick start with integrations -├── README.md # this file -├── requirements.txt # extra Python dependencies -└── run.py # CLI tool to run pipelines on ZenML Stack -``` diff --git a/databricks-demo/.assets/00_pipelines_composition.png b/databricks-production-qa-demo/.assets/00_pipelines_composition.png similarity index 100% rename from databricks-demo/.assets/00_pipelines_composition.png rename to databricks-production-qa-demo/.assets/00_pipelines_composition.png diff --git a/databricks-demo/.copier-answers.yml b/databricks-production-qa-demo/.copier-answers.yml similarity index 100% rename from databricks-demo/.copier-answers.yml rename to databricks-production-qa-demo/.copier-answers.yml diff --git a/databricks-demo/.dockerignore b/databricks-production-qa-demo/.dockerignore similarity index 100% rename from databricks-demo/.dockerignore rename to databricks-production-qa-demo/.dockerignore diff --git a/databricks-demo/LICENSE b/databricks-production-qa-demo/LICENSE similarity index 100% rename from databricks-demo/LICENSE rename to databricks-production-qa-demo/LICENSE diff --git a/databricks-demo/Makefile b/databricks-production-qa-demo/Makefile similarity index 100% rename from databricks-demo/Makefile rename to databricks-production-qa-demo/Makefile diff --git a/databricks-production-qa-demo/README.md b/databricks-production-qa-demo/README.md new file mode 100644 index 00000000..0ab3b54a --- /dev/null +++ b/databricks-production-qa-demo/README.md @@ -0,0 +1,154 @@ +# Databricks + ZenML: End-to-End Explainable ML Project + +Welcome to this end-to-end demo project that showcases how to train, deploy, and run batch inference on a machine learning model using ZenML in a Databricks environment. This setup demonstrates how ZenML can simplify the end-to-end process of building reproducible, production-grade ML pipelines with minimal fuss. + +## Overview + +This project uses an example classification dataset (Breast Cancer) and provides three major pipelines: + +1. Training Pipeline +2. Deployment Pipeline +3. Batch Inference Pipeline (with SHAP-based model explainability) + +The pipelines are orchestrated via ZenML. Additionally, this setup uses: +- Databricks as the orchestrator +- MLflow for experiment tracking and model registry +- Evidently for data drift detection +- SHAP for model explainability during inference +- Slack notifications (configurable through ZenML's alerter stack components) + +## Why ZenML? 
+ +ZenML is a lightweight MLOps framework for reproducible pipelines. With ZenML, you get: + +- A consistent, standardized way to develop, version, and share pipelines. +- Easy integration with various cloud providers, experiment trackers, model registries, and more. +- Reproducibility and better collaboration: your pipelines and associated artifacts are automatically tracked and versioned. +- Simple command-line interface for spinning pipelines up and down with different stack components (like local or Databricks orchestrators). +- Built-in best practices for production ML, including quality gates for data drift and model performance thresholds. + +## Project Structure + +Here's an outline of the repository: + +``` +. +├── configs # Pipeline configuration files +│ ├── deployer_config.yaml # Deployment pipeline config +│ ├── inference_config.yaml # Batch inference pipeline config +│ └── train_config.yaml # Training pipeline config +├── pipelines # ZenML pipeline definitions +│ ├── batch_inference.py # Orchestrates batch inference +│ ├── deployment.py # Deploys a model service +│ └── training.py # Trains and promotes model +├── steps # ZenML steps logic +│ ├── alerts # Alert/notification logic +│ ├── data_quality # Data drift and quality checks +│ ├── deployment # Deployment step +│ ├── etl # ETL steps (data loading, preprocessing, splitting) +│ ├── explainability # SHAP-based model explanations +│ ├── hp_tuning # Hyperparameter tuning pipeline steps +│ ├── inference # Batch inference step +│ ├── promotion # Model promotion logic +│ └── training # Model training and evaluation steps +├── utils # Helper modules +├── Makefile # Quick integration setup commands +├── requirements.txt # Python dependencies +├── run.py # CLI to run pipelines +└── README.md # This file +``` + +## Getting Started + +1. (Optional) Create and activate a Python virtual environment: + ```bash + python3 -m venv .venv + source .venv/bin/activate + ``` +2. Install dependencies: + ```bash + make setup + ``` + This installs the required ZenML integrations (MLflow, Slack, Evidently, Kubeflow, Kubernetes, AWS, etc.) and any library dependencies. + +3. (Optional) Set up a local Stack (if you want to try this outside Databricks): + ```bash + make install-stack-local + ``` + +4. If you have Databricks properly configured in your ZenML stack (with the Databricks token secret set up, cluster name, etc.), you can orchestrate the pipelines on Databricks by default. + +## Running the Project + +All pipeline runs happen via the CLI in run.py. Here are the main options: + +• View available options: + ```bash + python run.py --help + ``` + +• Run everything (train, deploy, inference) with default settings: + ```bash + python run.py --training --deployment --inference + ``` + This will: + 1. Train a model and evaluate its performance + 2. Deploy the model if it meets quality criteria + 3. 
Run batch inference with SHAP explanations and data drift checks
+
+• Run just the training pipeline (to build or update a model):
+  ```bash
+  python run.py --training
+  ```
+
+• Run just the deployment pipeline (to deploy the latest staged model):
+  ```bash
+  python run.py --deployment
+  ```
+
+• Run just the batch inference pipeline (to generate predictions and explanations while checking for data drift):
+  ```bash
+  python run.py --inference
+  ```
+
+### Additional Command-Line Flags
+
+• Disable caching:
+  ```bash
+  python run.py --no-cache --training
+  ```
+
+• Skip dropping NA values or skip normalization:
+  ```bash
+  python run.py --no-drop-na --no-normalize --training
+  ```
+
+• Drop specific columns:
+  ```bash
+  python run.py --training --drop-columns columnA,columnB
+  ```
+
+• Set minimum accuracy thresholds for the training and test sets:
+  ```bash
+  python run.py --min-train-accuracy 0.9 --min-test-accuracy 0.8 --fail-on-accuracy-quality-gates --training
+  ```
+
+When you run any of these commands, ZenML will orchestrate each pipeline on the active stack (Databricks if configured) and log the results in your model registry (MLflow). If you have Slack or other alerter components configured, you'll see success/failure notifications.
+
+## Observing Your Pipelines
+
+ZenML offers a local dashboard that you can launch with:
+```bash
+zenml login --local
+```
+Check the terminal logs for the local web address (usually http://127.0.0.1:8237). You'll see pipeline runs, steps, and artifacts.
+
+If you deployed on Databricks, you can also see the runs orchestrated in the Databricks jobs UI. The project is flexible enough to run the same pipelines locally or in the cloud without changing the code.
+
+## Contributing & License
+
+Contributions and suggestions are welcome. This project is licensed under the Apache License 2.0.
+
+For questions, feedback, or support, please reach out to the ZenML community or open an issue in this repository.
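+
+## How the Pipelines Are Defined
+
+If you are new to ZenML, here is a minimal sketch of the shape of the step and pipeline definitions used throughout `pipelines/` and `steps/`. The names below are illustrative only, not the exact definitions in this project:
+
+```python
+from typing_extensions import Annotated
+
+from sklearn.base import ClassifierMixin
+from sklearn.ensemble import RandomForestClassifier
+from zenml import pipeline, step
+
+
+@step
+def train_model() -> Annotated[ClassifierMixin, "model"]:
+    """Train a toy classifier (stands in for the real steps/training logic)."""
+    model = RandomForestClassifier()
+    model.fit([[0.0], [1.0]], [0, 1])  # placeholder data
+    return model
+
+
+@pipeline
+def training_pipeline():
+    # Steps are composed by calling them inside a pipeline function.
+    train_model()
+
+
+if __name__ == "__main__":
+    # Runs on whatever stack is currently active (local or Databricks).
+    training_pipeline()
+```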
+ +--- diff --git a/databricks-demo/configs/deployer_config.yaml b/databricks-production-qa-demo/configs/deployer_config.yaml similarity index 100% rename from databricks-demo/configs/deployer_config.yaml rename to databricks-production-qa-demo/configs/deployer_config.yaml diff --git a/databricks-demo/configs/inference_config.yaml b/databricks-production-qa-demo/configs/inference_config.yaml similarity index 100% rename from databricks-demo/configs/inference_config.yaml rename to databricks-production-qa-demo/configs/inference_config.yaml diff --git a/databricks-demo/configs/train_config.yaml b/databricks-production-qa-demo/configs/train_config.yaml similarity index 98% rename from databricks-demo/configs/train_config.yaml rename to databricks-production-qa-demo/configs/train_config.yaml index 843abd5f..e8941c6e 100644 --- a/databricks-demo/configs/train_config.yaml +++ b/databricks-production-qa-demo/configs/train_config.yaml @@ -24,6 +24,7 @@ settings: - mlflow - sklearn - databricks + python_package_installer: "uv" orchestrator.databricks: cluster_name: adas_ node_type_id: Standard_D8ads_v5 diff --git a/databricks-demo/pipelines/__init__.py b/databricks-production-qa-demo/pipelines/__init__.py similarity index 100% rename from databricks-demo/pipelines/__init__.py rename to databricks-production-qa-demo/pipelines/__init__.py diff --git a/databricks-demo/pipelines/batch_inference.py b/databricks-production-qa-demo/pipelines/batch_inference.py similarity index 94% rename from databricks-demo/pipelines/batch_inference.py rename to databricks-production-qa-demo/pipelines/batch_inference.py index dacb5605..e2b32d20 100644 --- a/databricks-demo/pipelines/batch_inference.py +++ b/databricks-production-qa-demo/pipelines/batch_inference.py @@ -29,6 +29,8 @@ from zenml.integrations.evidently.steps import evidently_report_step from zenml.logger import get_logger +from steps.explainability import explain_model + logger = get_logger(__name__) @@ -53,6 +55,10 @@ def production_line_qa_batch_inference(): preprocess_pipeline=model.get_artifact("preprocess_pipeline"), target=target, ) + + ########## Model Explainability stage ########## + explain_model(df_inference) + ########## DataQuality stage ########## report, _ = evidently_report_step( reference_dataset=model.get_artifact("dataset_trn"), diff --git a/databricks-demo/pipelines/deployment.py b/databricks-production-qa-demo/pipelines/deployment.py similarity index 100% rename from databricks-demo/pipelines/deployment.py rename to databricks-production-qa-demo/pipelines/deployment.py diff --git a/databricks-demo/pipelines/training.py b/databricks-production-qa-demo/pipelines/training.py similarity index 100% rename from databricks-demo/pipelines/training.py rename to databricks-production-qa-demo/pipelines/training.py diff --git a/databricks-demo/requirements.txt b/databricks-production-qa-demo/requirements.txt similarity index 100% rename from databricks-demo/requirements.txt rename to databricks-production-qa-demo/requirements.txt diff --git a/databricks-demo/run.py b/databricks-production-qa-demo/run.py similarity index 100% rename from databricks-demo/run.py rename to databricks-production-qa-demo/run.py diff --git a/databricks-demo/steps/__init__.py b/databricks-production-qa-demo/steps/__init__.py similarity index 100% rename from databricks-demo/steps/__init__.py rename to databricks-production-qa-demo/steps/__init__.py diff --git a/databricks-demo/steps/alerts/__init__.py b/databricks-production-qa-demo/steps/alerts/__init__.py similarity 
index 100% rename from databricks-demo/steps/alerts/__init__.py rename to databricks-production-qa-demo/steps/alerts/__init__.py diff --git a/databricks-demo/steps/alerts/notify_on.py b/databricks-production-qa-demo/steps/alerts/notify_on.py similarity index 100% rename from databricks-demo/steps/alerts/notify_on.py rename to databricks-production-qa-demo/steps/alerts/notify_on.py diff --git a/databricks-demo/steps/data_quality/__init__.py b/databricks-production-qa-demo/steps/data_quality/__init__.py similarity index 100% rename from databricks-demo/steps/data_quality/__init__.py rename to databricks-production-qa-demo/steps/data_quality/__init__.py diff --git a/databricks-demo/steps/data_quality/drift_quality_gate.py b/databricks-production-qa-demo/steps/data_quality/drift_quality_gate.py similarity index 100% rename from databricks-demo/steps/data_quality/drift_quality_gate.py rename to databricks-production-qa-demo/steps/data_quality/drift_quality_gate.py diff --git a/databricks-demo/steps/deployment/__init__.py b/databricks-production-qa-demo/steps/deployment/__init__.py similarity index 100% rename from databricks-demo/steps/deployment/__init__.py rename to databricks-production-qa-demo/steps/deployment/__init__.py diff --git a/databricks-demo/steps/deployment/deployment_deploy.py b/databricks-production-qa-demo/steps/deployment/deployment_deploy.py similarity index 100% rename from databricks-demo/steps/deployment/deployment_deploy.py rename to databricks-production-qa-demo/steps/deployment/deployment_deploy.py diff --git a/databricks-demo/steps/etl/__init__.py b/databricks-production-qa-demo/steps/etl/__init__.py similarity index 100% rename from databricks-demo/steps/etl/__init__.py rename to databricks-production-qa-demo/steps/etl/__init__.py diff --git a/databricks-demo/steps/etl/data_loader.py b/databricks-production-qa-demo/steps/etl/data_loader.py similarity index 100% rename from databricks-demo/steps/etl/data_loader.py rename to databricks-production-qa-demo/steps/etl/data_loader.py diff --git a/databricks-demo/steps/etl/inference_data_preprocessor.py b/databricks-production-qa-demo/steps/etl/inference_data_preprocessor.py similarity index 100% rename from databricks-demo/steps/etl/inference_data_preprocessor.py rename to databricks-production-qa-demo/steps/etl/inference_data_preprocessor.py diff --git a/databricks-demo/steps/etl/train_data_preprocessor.py b/databricks-production-qa-demo/steps/etl/train_data_preprocessor.py similarity index 100% rename from databricks-demo/steps/etl/train_data_preprocessor.py rename to databricks-production-qa-demo/steps/etl/train_data_preprocessor.py diff --git a/databricks-demo/steps/etl/train_data_splitter.py b/databricks-production-qa-demo/steps/etl/train_data_splitter.py similarity index 100% rename from databricks-demo/steps/etl/train_data_splitter.py rename to databricks-production-qa-demo/steps/etl/train_data_splitter.py diff --git a/databricks-production-qa-demo/steps/explainability/__init__.py b/databricks-production-qa-demo/steps/explainability/__init__.py new file mode 100644 index 00000000..ff7b7487 --- /dev/null +++ b/databricks-production-qa-demo/steps/explainability/__init__.py @@ -0,0 +1 @@ +from .shap_explainer import explain_model \ No newline at end of file diff --git a/databricks-production-qa-demo/steps/explainability/shap_explainer.py b/databricks-production-qa-demo/steps/explainability/shap_explainer.py new file mode 100644 index 00000000..afcf5288 --- /dev/null +++ 
b/databricks-production-qa-demo/steps/explainability/shap_explainer.py
@@ -0,0 +1,31 @@
+from sklearn.base import ClassifierMixin
+from zenml import get_step_context, log_artifact_metadata
+import shap
+import pandas as pd
+from typing import Annotated
+from zenml.steps import step
+from .shap_visualization import SHAPVisualization
+
+@step
+def explain_model(
+    X_train: pd.DataFrame
+) -> Annotated[SHAPVisualization, "shap_visualization"]:
+    """Generate SHAP values for model explainability and create a visualization."""
+    model = get_step_context().model
+    model_artifact: ClassifierMixin = model.load_artifact("model")
+
+    explainer = shap.KernelExplainer(model_artifact.predict_proba, shap.sample(X_train, 100))
+    shap_values = explainer.shap_values(X_train.iloc[:100])
+
+    # The metadata must target this step's output artifact, which is
+    # annotated "shap_visualization" above.
+    log_artifact_metadata(
+        artifact_name="shap_visualization",
+        metadata={
+            "shap_info": {
+                "shape": [arr.shape for arr in shap_values],
+                "n_classes": len(shap_values),
+                "n_features": shap_values[0].shape[1],
+            }
+        }
+    )
+
+    return SHAPVisualization(shap_values, X_train.columns)
\ No newline at end of file
diff --git a/databricks-production-qa-demo/steps/explainability/shap_visualization.py b/databricks-production-qa-demo/steps/explainability/shap_visualization.py
new file mode 100644
index 00000000..b0c48f64
--- /dev/null
+++ b/databricks-production-qa-demo/steps/explainability/shap_visualization.py
@@ -0,0 +1,41 @@
+import os
+import io
+from typing import Dict
+
+import shap
+import matplotlib.pyplot as plt
+
+from zenml.enums import ArtifactType, VisualizationType
+from zenml.io import fileio
+from zenml.materializers.base_materializer import BaseMaterializer
+
+# Custom class to hold SHAP visualization data
+class SHAPVisualization:
+    def __init__(self, shap_values, feature_names):
+        self.shap_values = shap_values
+        self.feature_names = feature_names
+
+
+# Custom materializer for SHAPVisualization
+class SHAPVisualizationMaterializer(BaseMaterializer):
+    ASSOCIATED_TYPES = (SHAPVisualization,)
+    ASSOCIATED_ARTIFACT_TYPE = ArtifactType.DATA_ANALYSIS
+
+    def save_visualizations(
+        self, data: SHAPVisualization
+    ) -> Dict[str, VisualizationType]:
+        plt.figure(figsize=(10, 6))
+        shap.summary_plot(data.shap_values, feature_names=data.feature_names, plot_type="bar", show=False)
+        plt.title("SHAP Feature Importance")
+
+        buf = io.BytesIO()
+        plt.savefig(buf, format='png', dpi=150, bbox_inches='tight')
+        buf.seek(0)
+
+        visualization_path = os.path.join(self.uri, "shap_summary_plot.png")
+        with fileio.open(visualization_path, 'wb') as f:
+            f.write(buf.getvalue())
+
+        plt.close()
+
+        return {visualization_path: VisualizationType.IMAGE}
diff --git a/databricks-demo/steps/hp_tuning/__init__.py b/databricks-production-qa-demo/steps/hp_tuning/__init__.py
similarity index 100%
rename from databricks-demo/steps/hp_tuning/__init__.py
rename to databricks-production-qa-demo/steps/hp_tuning/__init__.py
diff --git a/databricks-demo/steps/hp_tuning/hp_tuning_select_best_model.py b/databricks-production-qa-demo/steps/hp_tuning/hp_tuning_select_best_model.py
similarity index 92%
rename from databricks-demo/steps/hp_tuning/hp_tuning_select_best_model.py
rename to databricks-production-qa-demo/steps/hp_tuning/hp_tuning_select_best_model.py
index 7d5a6bc3..b69d57a3 100644
--- a/databricks-demo/steps/hp_tuning/hp_tuning_select_best_model.py
+++ b/databricks-production-qa-demo/steps/hp_tuning/hp_tuning_select_best_model.py
@@ -27,7 +27,7 @@
 logger = get_logger(__name__)
 
 
-@step
+@step(enable_cache=False)
 def hp_tuning_select_best_model(
step_names: List[str], ) -> Annotated[ClassifierMixin, "best_model"]: @@ -47,10 +47,10 @@ def hp_tuning_select_best_model( best_metric = -1 # consume artifacts attached to current model version in Model Control Plane for step_name in step_names: - hp_output = model.get_data_artifact("hp_result") + hp_output = model.get_model_artifact("hp_result") model_: ClassifierMixin = hp_output.load() # fetch metadata we attached earlier - metric = float(hp_output.run_metadata["metric"].value) + metric = float(hp_output.run_metadata["metric"]) if best_model is None or best_metric < metric: best_model = model_ ### YOUR CODE ENDS HERE ### diff --git a/databricks-demo/steps/hp_tuning/hp_tuning_single_search.py b/databricks-production-qa-demo/steps/hp_tuning/hp_tuning_single_search.py similarity index 100% rename from databricks-demo/steps/hp_tuning/hp_tuning_single_search.py rename to databricks-production-qa-demo/steps/hp_tuning/hp_tuning_single_search.py diff --git a/databricks-demo/steps/inference/__init__.py b/databricks-production-qa-demo/steps/inference/__init__.py similarity index 100% rename from databricks-demo/steps/inference/__init__.py rename to databricks-production-qa-demo/steps/inference/__init__.py diff --git a/databricks-demo/steps/inference/inference_predict.py b/databricks-production-qa-demo/steps/inference/inference_predict.py similarity index 100% rename from databricks-demo/steps/inference/inference_predict.py rename to databricks-production-qa-demo/steps/inference/inference_predict.py diff --git a/databricks-demo/steps/promotion/__init__.py b/databricks-production-qa-demo/steps/promotion/__init__.py similarity index 100% rename from databricks-demo/steps/promotion/__init__.py rename to databricks-production-qa-demo/steps/promotion/__init__.py diff --git a/databricks-demo/steps/promotion/compute_performance_metrics_on_current_data.py b/databricks-production-qa-demo/steps/promotion/compute_performance_metrics_on_current_data.py similarity index 100% rename from databricks-demo/steps/promotion/compute_performance_metrics_on_current_data.py rename to databricks-production-qa-demo/steps/promotion/compute_performance_metrics_on_current_data.py diff --git a/databricks-demo/steps/promotion/promote_with_metric_compare.py b/databricks-production-qa-demo/steps/promotion/promote_with_metric_compare.py similarity index 98% rename from databricks-demo/steps/promotion/promote_with_metric_compare.py rename to databricks-production-qa-demo/steps/promotion/promote_with_metric_compare.py index d68409d2..57294c04 100644 --- a/databricks-demo/steps/promotion/promote_with_metric_compare.py +++ b/databricks-production-qa-demo/steps/promotion/promote_with_metric_compare.py @@ -89,14 +89,14 @@ def promote_with_metric_compare( # Promote in Model Registry latest_version_model_registry_number = latest_version.run_metadata[ "model_registry_version" - ].value + ] if current_version_number is None: current_version_model_registry_number = ( latest_version_model_registry_number ) else: current_version_model_registry_number = ( - current_version.run_metadata["model_registry_version"].value + current_version.run_metadata["model_registry_version"] ) promote_in_model_registry( latest_version=latest_version_model_registry_number, @@ -108,7 +108,7 @@ def promote_with_metric_compare( else: promoted_version = current_version.run_metadata[ "model_registry_version" - ].value + ] logger.info( f"Current model version in `{target_env}` is `{promoted_version}` registered in Model Registry" diff --git 
a/databricks-demo/steps/training/__init__.py b/databricks-production-qa-demo/steps/training/__init__.py similarity index 100% rename from databricks-demo/steps/training/__init__.py rename to databricks-production-qa-demo/steps/training/__init__.py diff --git a/databricks-demo/steps/training/model_evaluator.py b/databricks-production-qa-demo/steps/training/model_evaluator.py similarity index 97% rename from databricks-demo/steps/training/model_evaluator.py rename to databricks-production-qa-demo/steps/training/model_evaluator.py index 51d3f62e..a4b0d070 100644 --- a/databricks-demo/steps/training/model_evaluator.py +++ b/databricks-production-qa-demo/steps/training/model_evaluator.py @@ -2,7 +2,7 @@ import mlflow import pandas as pd from sklearn.base import ClassifierMixin -from zenml import step, get_step_context, log_model_metadata +from zenml import step, get_step_context, log_metadata from zenml.client import Client from zenml.logger import get_logger from PIL import Image, ImageDraw, ImageFont, ImageFilter @@ -34,13 +34,14 @@ def model_evaluator( step_context = get_step_context() - log_model_metadata( + log_metadata( metadata={ "evaluation_metrics": { "train_accuracy": trn_acc, "test_accuracy": tst_acc } }, + infer_model=True ) # Fetch previous versions (same as before) @@ -49,7 +50,7 @@ def model_evaluator( for version in client.get_model(step_context.model.name).versions: version_obj = client.get_model_version(step_context.model.name, version.version) if "evaluation_metrics" in version_obj.run_metadata: - test_accuracy = version_obj.run_metadata["evaluation_metrics"].value.get("test_accuracy") + test_accuracy = version_obj.run_metadata["evaluation_metrics"].get("test_accuracy") if test_accuracy is not None: previous_versions.append((f"v{version.version}", float(test_accuracy))) diff --git a/databricks-demo/steps/training/model_trainer.py b/databricks-production-qa-demo/steps/training/model_trainer.py similarity index 96% rename from databricks-demo/steps/training/model_trainer.py rename to databricks-production-qa-demo/steps/training/model_trainer.py index 469a3510..51e00aab 100644 --- a/databricks-demo/steps/training/model_trainer.py +++ b/databricks-production-qa-demo/steps/training/model_trainer.py @@ -22,6 +22,7 @@ from zenml import ArtifactConfig, get_step_context, step from zenml.client import Client +from zenml.enums import ArtifactType from zenml.integrations.mlflow.experiment_trackers import ( MLFlowExperimentTracker, ) @@ -50,7 +51,7 @@ def model_trainer( target: str, name: str, ) -> Annotated[ - ClassifierMixin, ArtifactConfig(name="model", is_model_artifact=True) + ClassifierMixin, ArtifactConfig(name="model", artifact_type=ArtifactType.MODEL) ]: """Configure and train a model on the training dataset. 
diff --git a/databricks-demo/utils/__init__.py b/databricks-production-qa-demo/utils/__init__.py similarity index 100% rename from databricks-demo/utils/__init__.py rename to databricks-production-qa-demo/utils/__init__.py diff --git a/databricks-demo/utils/get_model_from_config.py b/databricks-production-qa-demo/utils/get_model_from_config.py similarity index 100% rename from databricks-demo/utils/get_model_from_config.py rename to databricks-production-qa-demo/utils/get_model_from_config.py diff --git a/databricks-demo/utils/preprocess.py b/databricks-production-qa-demo/utils/preprocess.py similarity index 100% rename from databricks-demo/utils/preprocess.py rename to databricks-production-qa-demo/utils/preprocess.py diff --git a/databricks-demo/utils/promote_in_model_registry.py b/databricks-production-qa-demo/utils/promote_in_model_registry.py similarity index 100% rename from databricks-demo/utils/promote_in_model_registry.py rename to databricks-production-qa-demo/utils/promote_in_model_registry.py diff --git a/explainability-shap/.assets/model.gif b/explainability-shap/.assets/model.gif deleted file mode 100644 index f48a8adb..00000000 Binary files a/explainability-shap/.assets/model.gif and /dev/null differ diff --git a/explainability-shap/.assets/shap_visualization.png b/explainability-shap/.assets/shap_visualization.png deleted file mode 100644 index cd8dd4d9..00000000 Binary files a/explainability-shap/.assets/shap_visualization.png and /dev/null differ diff --git a/explainability-shap/.dockerignore b/explainability-shap/.dockerignore deleted file mode 100644 index cb70b74a..00000000 --- a/explainability-shap/.dockerignore +++ /dev/null @@ -1,4 +0,0 @@ -!/materializers/** -!/pipelines/** -!/steps/** -!/utils/** diff --git a/explainability-shap/LICENSE b/explainability-shap/LICENSE deleted file mode 100644 index 75d01fb4..00000000 --- a/explainability-shap/LICENSE +++ /dev/null @@ -1,15 +0,0 @@ -Apache Software License 2.0 - -Copyright (c) ZenML GmbH 2024. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. diff --git a/explainability-shap/README.md b/explainability-shap/README.md deleted file mode 100644 index fbe582cf..00000000 --- a/explainability-shap/README.md +++ /dev/null @@ -1,81 +0,0 @@ -# 🌸 Iris Classification MLOps Pipeline with ZenML - -Welcome to the Iris Classification MLOps project! This project demonstrates how to build a production-ready machine learning pipeline using ZenML. It showcases various MLOps practices including data preparation, model training, evaluation, explainability, and data drift detection. - -## 🌟 Features - -- Data loading and splitting using scikit-learn's iris dataset -- SVM model training with hyperparameter configuration -- Model evaluation with accuracy metrics -- Model explainability using SHAP (SHapley Additive exPlanations) -- Data drift detection between training and test sets -- Artifact and metadata logging for enhanced traceability - -
-![Iris Classification Pipeline](.assets/model.gif)
-
-## 🏃 How to Run
-
-Before running the pipeline, set up your environment as follows:
-
-```bash
-# Set up a Python virtual environment
-python3 -m venv .venv
-source .venv/bin/activate
-
-# Install requirements
-pip install -r requirements.txt
-```
-
-To run the Iris Classification pipeline:
-
-```shell
-python run.py
-```
-
-## 🧩 Pipeline Steps
-
-1. **Load Data**: Loads the iris dataset and splits it into train and test sets.
-2. **Train Model**: Trains an SVM classifier on the training data.
-3. **Evaluate Model**: Evaluates the model on the test set and generates predictions.
-4. **Explain Model**: Generates SHAP values for model explainability.
-5. **Detect Data Drift**: Detects potential data drift between training and test sets.
-
-## 📊 Visualizations
-
-The pipeline generates a SHAP summary plot to explain feature importance:
-
-![SHAP Summary Plot](.assets/shap_visualization.png)
- -## 🛠️ Customization - -You can customize various aspects of the pipeline: - -- Adjust the `SVC` hyperparameters in the `train_model` step -- Modify the train-test split ratio in the `load_data` step -- Add or remove features from the iris dataset -- Implement additional evaluation metrics in the `evaluate_model` step - -## 📜 Project Structure - -``` -. -├── run.py # Main pipeline file -├── requirements.txt # Python dependencies -└── README.md # This file -``` - -## 🤝 Contributing - -Contributions to improve the pipeline are welcome! Please feel free to submit a Pull Request. - -## 📄 License - -This project is licensed under the Apache License 2.0. See the LICENSE file for details. \ No newline at end of file diff --git a/explainability-shap/requirements.txt b/explainability-shap/requirements.txt deleted file mode 100644 index da6d548a..00000000 --- a/explainability-shap/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -scikit-learn -shap -matplotlib -scipy -zenml>=0.70.0 -pyarrow -fastparquet -numpy<2.0.0 \ No newline at end of file diff --git a/explainability-shap/run.ipynb b/explainability-shap/run.ipynb deleted file mode 100644 index b1026ef9..00000000 --- a/explainability-shap/run.ipynb +++ /dev/null @@ -1,373 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Iris Classification Pipeline with ZenML\n", - "\n", - "This notebook demonstrates a ZenML pipeline for iris classification, including data loading, model training, evaluation, explainability, and data drift detection." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!zenml login https://d13d987c-zenml.cloudinfra.zenml.io" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from zenml.client import Client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "Client().activate_stack(\"default_with_s3\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!zenml stack describe 'local-aws-step-operator'\n", - "Client().activate_stack('local-aws-step-operator')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# !zenml stack describe 'aws-sagemaker-pipelines'\n", - "# Client().activate_stack('aws-sagemaker-pipelines')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from sklearn.datasets import load_iris\n", - "from sklearn.model_selection import train_test_split\n", - "from zenml import step, log_metadata\n", - "from typing import Tuple, Dict, Any\n", - "from typing_extensions import Annotated\n", - "\n", - "def safe_metadata(data: Any) -> Dict[str, Any]:\n", - " \"\"\"Create metadata dict with only supported types.\"\"\"\n", - " metadata = {\"shape\": data.shape}\n", - " if isinstance(data, pd.DataFrame):\n", - " metadata[\"columns\"] = list(data.columns)\n", - " return metadata\n", - "\n", - "\n", - "@step\n", - "def load_data() -> Tuple[\n", - " Annotated[pd.DataFrame, \"X_train\"],\n", - " Annotated[pd.DataFrame, \"X_test\"],\n", - " Annotated[pd.Series, \"y_train\"],\n", - " Annotated[pd.Series, \"y_test\"],\n", - "]:\n", - " \"\"\"Load the iris dataset and split into train and test sets.\"\"\"\n", - " iris = load_iris(as_frame=True)\n", - " X = 
iris.data\n", - " y = iris.target\n", - " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,\n", - " random_state=42)\n", - "\n", - " for name, data in [(\"X_train\", X_train), (\"X_test\", X_test),\n", - " (\"y_train\", y_train), (\"y_test\", y_test)]:\n", - " log_metadata(\n", - " artifact_name=name,\n", - " metadata={\"dataset_info\": safe_metadata(data)},\n", - " infer_artifact=True,\n", - " )\n", - "\n", - " return X_train, X_test, y_train, y_test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from sklearn.svm import SVC\n", - "from zenml import step, ArtifactConfig, log_metadata\n", - "from typing_extensions import Annotated\n", - "from zenml.config import ResourceSettings\n", - "from zenml.integrations.aws.flavors.sagemaker_step_operator_flavor import SagemakerStepOperatorSettings\n", - "from zenml.enums import ArtifactType\n", - "\n", - "@step(\n", - " enable_cache=False,\n", - " step_operator=\"aws-sagemaker-pipelines\",\n", - " settings={\n", - " \"step_operator.sagemaker\": SagemakerStepOperatorSettings(estimator_args={\"instance_type\": \"ml.p3.2xlarge\"})\n", - " }\n", - ")\n", - "def train_model(\n", - " X_train: pd.DataFrame,\n", - " y_train: pd.Series,\n", - ") -> Annotated[SVC, ArtifactConfig(name=\"model\", artifact_type=ArtifactType.MODEL)]:\n", - " \"\"\"Train an SVM classifier.\"\"\"\n", - " model = SVC(kernel='rbf', probability=True)\n", - " model.fit(X_train, y_train)\n", - " train_accuracy = model.score(X_train, y_train)\n", - "\n", - " log_metadata(\n", - " metadata={\n", - " \"training_metrics\": {\n", - " \"train_accuracy\": float(train_accuracy),\n", - " },\n", - " \"model_info\": {\n", - " \"model_type\": type(model).__name__,\n", - " \"kernel\": model.kernel,\n", - " }\n", - " },\n", - " infer_model=True,\n", - " )\n", - "\n", - " log_metadata(\n", - " metadata={\n", - " \"model_details\": {\n", - " \"type\": type(model).__name__,\n", - " \"kernel\": model.kernel,\n", - " \"n_support\": model.n_support_.tolist(),\n", - " }\n", - " },\n", - " artifact_name=\"model\",\n", - " infer_artifact=True,\n", - " )\n", - "\n", - " return model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import numpy as np\n", - "from sklearn.svm import SVC\n", - "from zenml import step, log_metadata\n", - "from typing import Tuple\n", - "from typing_extensions import Annotated\n", - "\n", - "@step\n", - "def evaluate_model(\n", - " model: SVC,\n", - " X_test: pd.DataFrame,\n", - " y_test: pd.Series,\n", - ") -> Tuple[\n", - " Annotated[np.ndarray, \"predictions\"],\n", - " Annotated[np.ndarray, \"probabilities\"]\n", - "]:\n", - " \"\"\"Evaluate the model and make predictions.\"\"\"\n", - " test_accuracy = model.score(X_test, y_test)\n", - " predictions = model.predict(X_test)\n", - " probabilities = model.predict_proba(X_test)\n", - "\n", - " log_metadata(\n", - " metadata={\n", - " \"evaluation_metrics\": {\n", - " \"test_accuracy\": float(test_accuracy),\n", - " }\n", - " },\n", - " infer_model=True,\n", - " )\n", - "\n", - " log_metadata(\n", - " metadata={\n", - " \"prediction_info\": {\n", - " \"shape\": predictions.shape,\n", - " \"unique_values\": np.unique(predictions).tolist()\n", - " }\n", - " },\n", - " artifact_name=\"predictions\",\n", - " infer_artifact=True,\n", - " )\n", - "\n", - " log_metadata(\n", - " metadata={\n", - " \"probability_info\": {\n", - " 
\"shape\": probabilities.shape,\n", - " \"min\": float(np.min(probabilities)),\n", - " \"max\": float(np.max(probabilities))\n", - " }\n", - " },\n", - " artifact_name=\"probabilities\",\n", - " infer_artifact=True,\n", - " )\n", - "\n", - " return predictions, probabilities" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import io\n", - "import pandas as pd\n", - "import shap\n", - "from sklearn.svm import SVC\n", - "from zenml import step, log_metadata\n", - "from typing import Dict\n", - "from typing_extensions import Annotated\n", - "import matplotlib.pyplot as plt\n", - "\n", - "from zenml.enums import ArtifactType, VisualizationType\n", - "from zenml.io import fileio\n", - "from zenml.materializers.base_materializer import BaseMaterializer\n", - "\n", - "\n", - "class SHAPVisualization:\n", - " def __init__(self, shap_values, feature_names):\n", - " self.shap_values = shap_values\n", - " self.feature_names = feature_names\n", - "\n", - "@step\n", - "def explain_model(\n", - " model: SVC,\n", - " X_train: pd.DataFrame\n", - ") -> Annotated[SHAPVisualization, \"shap_visualization\"]:\n", - " \"\"\"Generate SHAP values for model explainability and create a visualization.\"\"\"\n", - " explainer = shap.KernelExplainer(\n", - " model.predict_proba,\n", - " shap.sample(X_train, 100)\n", - " )\n", - " shap_values = explainer.shap_values(X_train.iloc[:100])\n", - "\n", - " log_metadata(\n", - " metadata={\n", - " \"shap_info\": {\n", - " \"shape\": [arr.shape for arr in shap_values],\n", - " \"n_classes\": len(shap_values),\n", - " \"n_features\": shap_values[0].shape[1],\n", - " }\n", - " },\n", - " artifact_name=\"shap_visualization\",\n", - " infer_artifact=True,\n", - " )\n", - "\n", - " return SHAPVisualization(shap_values, X_train.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from scipy.stats import ks_2samp\n", - "from zenml import step\n", - "from zenml import log_artifact_metadata\n", - "from typing import Dict\n", - "from typing_extensions import Annotated\n", - "\n", - "@step\n", - "def detect_data_drift(\n", - " X_train: pd.DataFrame,\n", - " X_test: pd.DataFrame,\n", - ") -> Annotated[Dict[str, float], \"drift_metrics\"]:\n", - " \"\"\"Detect data drift between training and test sets.\"\"\"\n", - " drift_metrics = {}\n", - " for column in X_train.columns:\n", - " _, p_value = ks_2samp(X_train[column], X_test[column])\n", - " drift_metrics[column] = p_value\n", - "\n", - " log_metadata(\n", - " metadata={\n", - " \"drift_summary\": {\n", - " \"high_drift_features\": [col for col, p in drift_metrics.items() if p < 0.05]\n", - " }\n", - " },\n", - " artifact_name=\"drift_metrics\",\n", - " infer_artifact=True,\n", - " )\n", - "\n", - " return drift_metrics" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from zenml import pipeline, Model\n", - "from zenml.config import DockerSettings\n", - "\n", - "@pipeline(\n", - " settings={\n", - " # \"docker\": DockerSettings(python_package_installer=\"uv\",\n", - " # requirements=\"requirements.txt\"),\n", - " # \"resources\": ResourceSettings(memory=\"8GB\"),\n", - " },\n", - " model=Model(name=\"high_risk_classification\")\n", - ")\n", - "def iris_classification_pipeline():\n", - " X_train, X_test, y_train, y_test = load_data()\n", - " model = train_model(X_train, y_train)\n", - " 
evaluate_model(model, X_test, y_test)\n", - " explain_model(model, X_train)\n", - " drift_metrics = detect_data_drift(X_train, X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run the pipeline\n", - "pipeline_run = iris_classification_pipeline()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/explainability-shap/run.py b/explainability-shap/run.py deleted file mode 100644 index 5a48e90d..00000000 --- a/explainability-shap/run.py +++ /dev/null @@ -1,257 +0,0 @@ -# Apache Software License 2.0 -# -# Copyright (c) ZenML GmbH 2024. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import io -import os -from typing import Tuple, Dict, Any - -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import shap -from scipy.stats import ks_2samp -from sklearn.datasets import load_iris -from sklearn.model_selection import train_test_split -from sklearn.svm import SVC -from typing_extensions import Annotated -from zenml import pipeline, step, Model, ArtifactConfig, log_metadata -from zenml.config import DockerSettings -from zenml.enums import ArtifactType, VisualizationType -from zenml.io import fileio -from zenml.logger import get_logger -from zenml.materializers.base_materializer import BaseMaterializer - -logger = get_logger(__name__) - - -# Custom class to hold SHAP visualization data -class SHAPVisualization: - def __init__(self, shap_values, feature_names): - self.shap_values = shap_values - self.feature_names = feature_names - - -# Custom materializer for SHAPVisualization -class SHAPVisualizationMaterializer(BaseMaterializer): - ASSOCIATED_TYPES = (SHAPVisualization,) - ASSOCIATED_ARTIFACT_TYPE = ArtifactType.DATA_ANALYSIS - - def save_visualizations( - self, data: SHAPVisualization - ) -> Dict[str, VisualizationType]: - plt.figure(figsize=(10, 6)) - shap.summary_plot(data.shap_values, feature_names=data.feature_names, - plot_type="bar", show=False) - plt.title("SHAP Feature Importance") - - buf = io.BytesIO() - plt.savefig(buf, format='png', dpi=150, bbox_inches='tight') - buf.seek(0) - - visualization_path = os.path.join(self.uri, "shap_summary_plot.png") - with fileio.open(visualization_path, 'wb') as f: - f.write(buf.getvalue()) - - plt.close() - - return {visualization_path: VisualizationType.IMAGE} - - -def safe_metadata(data: Any) -> Dict[str, Any]: - """Create metadata dict with only supported types.""" - metadata = {"shape": data.shape} - if isinstance(data, pd.DataFrame): - metadata["columns"] = list(data.columns) - return metadata - - -@step -def load_data() -> Tuple[ - 
Annotated[pd.DataFrame, "X_train"], - Annotated[pd.DataFrame, "X_test"], - Annotated[pd.Series, "y_train"], - Annotated[pd.Series, "y_test"], -]: - """Load the iris dataset and split into train and test sets.""" - iris = load_iris(as_frame=True) - X = iris.data - y = iris.target - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, - random_state=42) - - for name, data in [("X_train", X_train), ("X_test", X_test), - ("y_train", y_train), ("y_test", y_test)]: - log_metadata( - artifact_name=name, - metadata={"dataset_info": safe_metadata(data)}, - infer_artifact=True, - ) - - return X_train, X_test, y_train, y_test - - -@step -def train_model( - X_train: pd.DataFrame, - y_train: pd.Series, -) -> Annotated[SVC, ArtifactConfig(name="model", artifact_type=ArtifactType.MODEL)]: - """Train an SVM classifier.""" - model = SVC(kernel='rbf', probability=True) - model.fit(X_train, y_train) - train_accuracy = model.score(X_train, y_train) - - log_metadata( - metadata={ - "training_metrics": { - "train_accuracy": float(train_accuracy), - }, - "model_info": { - "model_type": type(model).__name__, - "kernel": model.kernel, - } - }, - infer_model=True, - ) - - log_metadata( - metadata={ - "model_details": { - "type": type(model).__name__, - "kernel": model.kernel, - "n_support": model.n_support_.tolist(), - } - }, - artifact_name="model", - infer_artifact=True, - ) - - return model - - -@step -def evaluate_model( - model: SVC, - X_test: pd.DataFrame, - y_test: pd.Series, -) -> Tuple[ - Annotated[np.ndarray, "predictions"], - Annotated[np.ndarray, "probabilities"] -]: - """Evaluate the model and make predictions.""" - test_accuracy = model.score(X_test, y_test) - predictions = model.predict(X_test) - probabilities = model.predict_proba(X_test) - - log_metadata( - metadata={ - "evaluation_metrics": { - "test_accuracy": float(test_accuracy), - } - }, - infer_model=True, - ) - - log_metadata( - metadata={ - "prediction_info": { - "shape": predictions.shape, - "unique_values": np.unique(predictions).tolist() - } - }, - artifact_name="predictions", - infer_artifact=True, - ) - - log_metadata( - metadata={ - "probability_info": { - "shape": probabilities.shape, - "min": float(np.min(probabilities)), - "max": float(np.max(probabilities)) - } - }, - artifact_name="probabilities", - infer_artifact=True, - ) - - return predictions, probabilities - - -@step -def explain_model( - model: SVC, - X_train: pd.DataFrame -) -> Annotated[SHAPVisualization, "shap_visualization"]: - """Generate SHAP values for model explainability and create a visualization.""" - explainer = shap.KernelExplainer(model.predict_proba, - shap.sample(X_train, 100)) - shap_values = explainer.shap_values(X_train.iloc[:100]) - - log_metadata( - metadata={ - "shap_info": { - "shape": [arr.shape for arr in shap_values], - "n_classes": len(shap_values), - "n_features": shap_values[0].shape[1], - } - }, - artifact_name="shap_visualization", - infer_artifact=True, - ) - - return SHAPVisualization(shap_values, X_train.columns) - - -@step -def detect_data_drift( - X_train: pd.DataFrame, - X_test: pd.DataFrame, -) -> Annotated[Dict[str, float], "drift_metrics"]: - """Detect data drift between training and test sets.""" - drift_metrics = {} - for column in X_train.columns: - _, p_value = ks_2samp(X_train[column], X_test[column]) - drift_metrics[column] = p_value - - log_metadata( - metadata={ - "drift_summary": { - "high_drift_features": [col for col, p in drift_metrics.items() - if p < 0.05] - } - }, - artifact_name="drift_metrics", - 
infer_artifact=True, - ) - - return drift_metrics - - -@pipeline( - enable_cache=False, - settings={"docker": DockerSettings(requirements="requirements.txt")}, - model=Model(name="high_risk_classification") -) -def iris_classification_pipeline(): - X_train, X_test, y_train, y_test = load_data() - model = train_model(X_train, y_train) - evaluate_model(model, X_test, y_test) - explain_model(model, X_train) - drift_metrics = detect_data_drift(X_train, X_test) - - -if __name__ == "__main__": - iris_classification_pipeline() diff --git a/huggingface-sagemaker/steps/tokenizer_loader/tokenizer_loader.py b/huggingface-sagemaker/steps/tokenizer_loader/tokenizer_loader.py index faf80386..21c05331 100644 --- a/huggingface-sagemaker/steps/tokenizer_loader/tokenizer_loader.py +++ b/huggingface-sagemaker/steps/tokenizer_loader/tokenizer_loader.py @@ -18,6 +18,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase from typing_extensions import Annotated from zenml import ArtifactConfig, step +from zenml.enums import ArtifactType from zenml.logger import get_logger logger = get_logger(__name__) @@ -28,7 +29,7 @@ def tokenizer_loader( lower_case: bool, ) -> Annotated[ PreTrainedTokenizerBase, - ArtifactConfig(name="base_tokenizer", is_model_artifact=True), + ArtifactConfig(name="base_tokenizer", artifact_type=ArtifactType.MODEL), ]: """Tokenizer selection step. diff --git a/huggingface-sagemaker/steps/training/model_trainer.py b/huggingface-sagemaker/steps/training/model_trainer.py index bbd180fd..0934336c 100644 --- a/huggingface-sagemaker/steps/training/model_trainer.py +++ b/huggingface-sagemaker/steps/training/model_trainer.py @@ -29,6 +29,7 @@ ) from typing_extensions import Annotated from utils.misc import compute_metrics +from zenml.enums import ArtifactType from zenml import ArtifactConfig, log_artifact_metadata, step from zenml.client import Client from zenml.integrations.mlflow.experiment_trackers import ( @@ -65,11 +66,11 @@ def model_trainer( weight_decay: Optional[float] = 0.01, ) -> Tuple[ Annotated[ - PreTrainedModel, ArtifactConfig(name="model", is_model_artifact=True) + PreTrainedModel, ArtifactConfig(name="model", artifact_type=ArtifactType.MODEL) ], Annotated[ PreTrainedTokenizerBase, - ArtifactConfig(name="tokenizer", is_model_artifact=True), + ArtifactConfig(name="tokenizer", artifact_type=ArtifactType.MODEL), ], ]: """ diff --git a/llm-agents/steps/agent_creator.py b/llm-agents/steps/agent_creator.py index 22a3c489..87ba7160 100644 --- a/llm-agents/steps/agent_creator.py +++ b/llm-agents/steps/agent_creator.py @@ -9,7 +9,7 @@ from pydantic import BaseModel from typing_extensions import Annotated from zenml import ArtifactConfig, log_artifact_metadata, step - +from zenml.enums import ArtifactType PIPELINE_NAME = "zenml_agent_creation_pipeline" # Choose what character to use for your agent's answers CHARACTER = "technical assistant" @@ -33,7 +33,7 @@ class Config: def agent_creator( vector_store: VectorStore, config: AgentParameters = AgentParameters() ) -> Annotated[ - AgentExecutor, ArtifactConfig(name="agent", is_model_artifact=True) + AgentExecutor, ArtifactConfig(name="agent", artifact_type=ArtifactType.MODEL) ]: """Create an agent from a vector store. 
diff --git a/llm-complete-guide/steps/finetune_embeddings.py b/llm-complete-guide/steps/finetune_embeddings.py
index 44a2f707..f0a9996a 100644
--- a/llm-complete-guide/steps/finetune_embeddings.py
+++ b/llm-complete-guide/steps/finetune_embeddings.py
@@ -51,7 +51,7 @@
 from zenml.client import Client
 from zenml.enums import ArtifactType
 from zenml.utils.cuda_utils import cleanup_gpu_memory
-
+from zenml.enums import ArtifactType
 
 @step
 def prepare_load_data(
diff --git a/llm-finetuning/steps/trainer.py b/llm-finetuning/steps/trainer.py
index 2f053a97..dee5bcbe 100644
--- a/llm-finetuning/steps/trainer.py
+++ b/llm-finetuning/steps/trainer.py
@@ -37,7 +37,7 @@
 from typing_extensions import Annotated
 from zenml import ArtifactConfig, log_model_metadata, save_artifact, step
 from zenml.client import Client
-
+from zenml.enums import ArtifactType
 
 # this is expensive so we cache it
 @functools.lru_cache(maxsize=None)
@@ -592,11 +592,11 @@ def trainer(
     args: Configuration,
 ) -> Tuple[
     Annotated[
-        Trainer, ArtifactConfig(name="trainer_obj", is_model_artifact=True)
+        Trainer, ArtifactConfig(name="trainer_obj", artifact_type=ArtifactType.MODEL)
     ],
     Annotated[
         GPT2TokenizerFast,
-        ArtifactConfig(name="tokenizer_obj", is_model_artifact=True),
+        ArtifactConfig(name="tokenizer_obj", artifact_type=ArtifactType.MODEL),
     ],
     Annotated[str, "peft_model_id"],
     Annotated[ConstantLengthDataset, "train_dataset"],
diff --git a/llm-lora-finetuning/steps/finetune.py b/llm-lora-finetuning/steps/finetune.py
index 2e362103..df925898 100644
--- a/llm-lora-finetuning/steps/finetune.py
+++ b/llm-lora-finetuning/steps/finetune.py
@@ -28,6 +28,7 @@
 from utils.loaders import load_base_model
 from utils.tokenizer import load_tokenizer
 from zenml import step, ArtifactConfig
+from zenml.enums import ArtifactType
 from zenml.logger import get_logger
 from zenml.materializers import BuiltInMaterializer
 from zenml.utils.cuda_utils import cleanup_gpu_memory
@@ -55,7 +56,7 @@ def finetune(
     use_fast: bool = True,
     load_in_4bit: bool = False,
     load_in_8bit: bool = False,
-) -> Annotated[Path, ArtifactConfig(name="ft_model_dir", is_model_artifact=True)]:
+) -> Annotated[Path, ArtifactConfig(name="ft_model_dir", artifact_type=ArtifactType.MODEL)]:
     """Finetune the model using PEFT.
 
     Base model will be derived from configure step and finetuned model will
diff --git a/native-experiment-tracking/steps/model_trainer.py b/native-experiment-tracking/steps/model_trainer.py
index 3179fb27..9e77bbd2 100644
--- a/native-experiment-tracking/steps/model_trainer.py
+++ b/native-experiment-tracking/steps/model_trainer.py
@@ -33,6 +33,7 @@
     step,
 )
 from zenml.client import Client
+from zenml.enums import ArtifactType
 from zenml.logger import get_logger
 
 logger = get_logger(__name__)
@@ -48,7 +49,7 @@
 ) -> Tuple[
     Annotated[
         ClassifierMixin,
-        ArtifactConfig(name="sklearn_classifier", is_model_artifact=True),
+        ArtifactConfig(name="sklearn_classifier", artifact_type=ArtifactType.MODEL),
     ],
     Annotated[Image.Image, "confusion_matrix"],
 ]:
diff --git a/train_and_deploy/steps/deployment/bento_builder.py b/train_and_deploy/steps/deployment/bento_builder.py
index c53c7b7f..3ed235f1 100644
--- a/train_and_deploy/steps/deployment/bento_builder.py
+++ b/train_and_deploy/steps/deployment/bento_builder.py
@@ -18,6 +18,7 @@
 from typing_extensions import Annotated
 from zenml import ArtifactConfig, get_step_context, step, __version__ as zenml_version
+from zenml.enums import ArtifactType
 from zenml.integrations.bentoml.steps import bento_builder_step
 from zenml.client import Client
 from zenml.logger import get_logger
@@ -35,7 +36,7 @@
 def bento_builder() -> (
     Annotated[
         Optional[bento.Bento],
-        ArtifactConfig(name="mlflow_deployment", is_model_artifact=True),
+        ArtifactConfig(name="mlflow_deployment", artifact_type=ArtifactType.MODEL),
     ]
 ):
     """Predictions step.
diff --git a/train_and_deploy/steps/training/model_trainer.py b/train_and_deploy/steps/training/model_trainer.py
index 8d2fdb2d..7c1f8f6c 100644
--- a/train_and_deploy/steps/training/model_trainer.py
+++ b/train_and_deploy/steps/training/model_trainer.py
@@ -22,8 +22,9 @@
 
 from zenml import log_model_metadata
 from zenml.metadata.metadata_types import Uri
-from zenml import ArtifactConfig, get_step_context, step
+from zenml import ArtifactConfig, step
 from zenml.client import Client
+from zenml.enums import ArtifactType
 from zenml.integrations.mlflow.experiment_trackers import (
     MLFlowExperimentTracker,
 )
@@ -52,7 +53,7 @@
 def model_trainer(
     name: str,
     target: str,
 ) -> Annotated[
-    ClassifierMixin, ArtifactConfig(name="model", is_model_artifact=True)
+    ClassifierMixin, ArtifactConfig(name="model", artifact_type=ArtifactType.MODEL)
 ]:
     """Configure and train a model on the training dataset.
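
The multi-output trainers above (`trainer`, `model_trainer`) apply the same substitution to each `Annotated` entry of their `Tuple` return types independently, while string-named outputs are untouched. A sketch under the same assumptions as before (hypothetical `toy_trainer` step and toy data, not repository code):

```python
from typing import Dict, Tuple

from sklearn.base import ClassifierMixin
from sklearn.tree import DecisionTreeClassifier
from typing_extensions import Annotated
from zenml import ArtifactConfig, step
from zenml.enums import ArtifactType


@step
def toy_trainer() -> Tuple[
    # Model outputs carry the explicit artifact type ...
    Annotated[
        ClassifierMixin,
        ArtifactConfig(name="model", artifact_type=ArtifactType.MODEL),
    ],
    # ... while string-annotated outputs need no change.
    Annotated[Dict[str, float], "metrics"],
]:
    """Hypothetical two-output trainer mirroring the signatures above."""
    X, y = [[0.0], [1.0], [2.0], [3.0]], [0, 0, 1, 1]
    model = DecisionTreeClassifier().fit(X, y)
    return model, {"train_accuracy": float(model.score(X, y))}
```
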