zenml-io
diff --git a/‎native-experiment-tracking/README.md‎
Lines changed: 23 additions & 1 deletion b/‎native-experiment-tracking/README.md‎
Lines changed: 23 additions & 1 deletion
diff --git a/‎native-experiment-tracking/analyze.py‎
Lines changed: 63 additions & 2 deletions b/‎native-experiment-tracking/analyze.py‎
Lines changed: 63 additions & 2 deletions
diff --git a/‎native-experiment-tracking/configs/feature_engineering.yaml‎
Lines changed: 11 additions & 0 deletions b/‎native-experiment-tracking/configs/feature_engineering.yaml‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎native-experiment-tracking/pipelines/training.py‎
Lines changed: 7 additions & 10 deletions b/‎native-experiment-tracking/pipelines/training.py‎
Lines changed: 7 additions & 10 deletions
diff --git a/‎native-experiment-tracking/run.py‎
Lines changed: 66 additions & 22 deletions b/‎native-experiment-tracking/run.py‎
Lines changed: 66 additions & 22 deletions
@@ -4,6 +4,28 @@ Although ZenML plugs into many [experiment trackers](https://www.zenml.io/vs/zen
 the functionality of experiment trackers is already covered by ZenML's native metadata and artifact tracking.
 This project aims to show these capabilities.
 
+## 🎯 Project Overview
+We're tackling a simple classification task using the breast cancer dataset. Our goal is to showcase how ZenML can effortlessly track experiments, hyperparameters, and results throughout the machine learning workflow.
+### 🔍 What We're Doing
+
+In this project, we begin by preparing the breast cancer dataset for our model through data preprocessing. For our machine learning task, we've chosen to use an SGDClassifier. Rather than relying on sklearn's GridSearchCV, we implement our own hyperparameter tuning process to showcase ZenML's robust tracking capabilities. Finally, we conduct a thorough analysis of the results, visualizing how various hyperparameters influence the model's accuracy. This approach allows us to demonstrate the power of ZenML in tracking and managing the machine learning workflow.
+
+We are by no means claiming that our solution outperforms GridSearchCV (spoiler alert: this demo won't). Rather, this project demonstrates how you would do hyperparameter tuning and experiment tracking with ZenML on large deep learning problems.
+
+### 🛠 The Pipeline
+Our ZenML pipeline consists of the following steps:
+
+The feature_engineering pipeline:
+* Data Loading: Load the breast cancer dataset.
+* Data Splitting: Split the data into training and testing sets.
+* Data Preprocessing: Preprocess our dataset.
+
+The model training pipeline:
+* Model Training: Train multiple SGDClassifiers with different hyperparameters.
+* Model Evaluation: Evaluate each model's performance.
+
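+By running this pipeline iteratively with different hyperparameter combinations, every run's hyperparameters and resulting accuracy are tracked by ZenML and can be compared afterwards.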
+
 ## :running: Run locally
 
 
@@ -26,7 +48,7 @@ zenml integration install sklearn pandas -y
 zenml init
 ```
 
-## Explore your experiments
+## 📈 Explore your experiments
 
 ...
 
 
@@ -1,6 +1,9 @@
 import numpy as np
 from matplotlib import pyplot as plt
 from zenml.client import Client
+import matplotlib.pyplot as plt
+import seaborn as sns
+import pandas as pd
 
 
 def main():
@@ -23,10 +26,53 @@ def main():
         test_accuracies.append(mv_metadata.get("test_accuracy", None).value)
         train_accuracies.append(mv_metadata.get("train_accuracy", None).value)
 
-    generate_plot(alpha_values, losses, penalties, test_accuracies)
+    generate_3d_plot(alpha_values, losses, penalties, test_accuracies)
+    generate_2d_plots(alpha_values, losses, penalties, test_accuracies)
 
 
-def generate_plot(alpha_values, losses, penalties, test_accuracies):
+def generate_2d_plots(alpha_values, losses, penalties, test_accuracies):
+    """Show one accuracy heatmap per penalty (rows: loss, columns: alpha).
+
+    The four arguments are parallel sequences: position ``k`` in each of them
+    describes the same training run. All heatmaps share one color scale so that
+    the single colorbar on the right is valid for every subplot.
+
+    Args:
+        alpha_values: Alpha value of each run.
+        losses: Loss name of each run.
+        penalties: Penalty name of each run.
+        test_accuracies: Test accuracy of each run.
+    """
+    # Convert the data into a DataFrame
+    df = pd.DataFrame({
+        'Alpha': alpha_values,
+        'Loss': losses,
+        'Penalty': penalties,
+        'Accuracy': test_accuracies
+    })
+
+    # Nothing to plot; plt.subplots would reject a grid with zero columns
+    if df.empty:
+        return
+
+    unique_penalties = df['Penalty'].unique()
+
+    # Common color limits, otherwise each heatmap would be normalized on its own
+    # and the shared colorbar would only describe the first subplot
+    vmin = df['Accuracy'].min()
+    vmax = df['Accuracy'].max()
+
+    # squeeze=False keeps `axes` two-dimensional even when there is a single penalty
+    fig, axes = plt.subplots(
+        1, len(unique_penalties), figsize=(20, 6), sharey=True, squeeze=False
+    )
+    axes = axes[0]
+    fig.suptitle('Accuracy Heatmap for Different Penalties', fontsize=16)
+
+    for i, penalty in enumerate(unique_penalties):
+        # Filter data for the current penalty
+        df_penalty = df[df['Penalty'] == penalty]
+
+        # pivot_table averages repeated runs of the same (loss, alpha) combination;
+        # a plain pivot raises a ValueError as soon as a combination occurs twice
+        pivot = df_penalty.pivot_table(
+            index='Loss', columns='Alpha', values='Accuracy', aggfunc='mean'
+        )
+
+        # Create heatmap
+        sns.heatmap(pivot, ax=axes[i], cmap='viridis', annot=True, fmt='.3f',
+                    cbar=False, vmin=vmin, vmax=vmax)
+
+        axes[i].set_title(f'Penalty: {penalty}')
+        axes[i].set_xlabel('Alpha')
+
+        if i == 0:
+            axes[i].set_ylabel('Loss')
+
+    # Lay out the heatmaps first, leaving the right 10% of the figure free ...
+    fig.tight_layout(rect=[0, 0, .9, 1])
+
+    # ... then add a colorbar to the right of the subplots
+    cbar_ax = fig.add_axes([.92, .15, .02, .7])
+    fig.colorbar(axes[0].collections[0], cax=cbar_ax, label='Accuracy')
+
+    plt.show()
+
+
+def generate_3d_plot(alpha_values, losses, penalties, test_accuracies):
     # Convert losses and penalties to numerical indices
     unique_losses = list(set(losses))
     unique_penalties = list(set(penalties))
@@ -40,6 +86,16 @@ def generate_plot(alpha_values, losses, penalties, test_accuracies):
 
     # Create a scatter plot
     scatter = ax.scatter(alpha_values, loss_indices, penalty_indices, c=test_accuracies, cmap='viridis')
+    # Find the point with the highest accuracy
+    max_accuracy_index = np.argmax(test_accuracies)
+    max_accuracy = test_accuracies[max_accuracy_index]
+    max_alpha = alpha_values[max_accuracy_index]
+    max_loss = losses[max_accuracy_index]
+    max_penalty = penalties[max_accuracy_index]
+
+    # Highlight the point with the highest accuracy
+    ax.scatter([max_alpha], [loss_indices[max_accuracy_index]], [penalty_indices[max_accuracy_index]],
+               c='red', s=100, edgecolors='black', linewidths=2, zorder=10)
 
     # Set labels for each axis
     ax.set_xlabel('Alpha')
@@ -62,6 +118,11 @@ def generate_plot(alpha_values, losses, penalties, test_accuracies):
     # Adjust the viewing angle
     ax.view_init(elev=20, azim=45)
 
+    # Add legend with highest accuracy point description
+    legend_text = f'Highest Accuracy:\nAccuracy: {max_accuracy:.4f}\nAlpha: {max_alpha}\nLoss: {max_loss}\nPenalty: {max_penalty}'
+    ax.text2D(0.05, 0.95, legend_text, transform=ax.transAxes, fontsize=10, verticalalignment='top',
+              bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
+
     # Show the plot
     plt.tight_layout()
     plt.show()
 
@@ -0,0 +1,11 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+      - pandas
+    requirements:
+      - pyarrow
+
+# pipeline configuration
+test_size: 0.35
@@ -16,27 +16,23 @@
 #
 
 from typing import Optional
-from uuid import UUID
-
-from steps import model_evaluator, model_promoter, model_trainer
 
 from pipelines import (
     feature_engineering,
 )
+from steps import model_evaluator, model_trainer
 from zenml import pipeline
-from zenml.client import Client
 from zenml.logger import get_logger
 
-
 logger = get_logger(__name__)
 
 
 @pipeline
 def training(
-    alpha_value: float,
-    penalty: str,
-    loss: str,
-    target: Optional[str] = "target",
+        alpha_value: float,
+        penalty: str,
+        loss: str,
+        target: Optional[str] = "target",
 ):
     """
     Model training pipeline.
@@ -63,9 +59,10 @@ def training(
         dataset_trn=dataset_trn, target=target, alpha_value=alpha_value, penalty=penalty, loss=loss
     )
 
-    acc, _ = model_evaluator(
+    test_acc = model_evaluator(
         model=model,
         dataset_trn=dataset_trn,
         dataset_tst=dataset_tst,
         target=target,
     )
+    return test_acc
@@ -14,15 +14,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+import concurrent
+import multiprocessing
 import os
+import traceback
+from itertools import product
 
 import click
-from sklearn.utils._param_validation import InvalidParameterError
 from zenml import Model
 from zenml.client import Client
 from zenml.logger import get_logger
 
-from pipelines import training
+from pipelines import training, feature_engineering
 
 logger = get_logger(__name__)
 
@@ -34,7 +37,8 @@
     help="Disable caching for the pipeline run.",
 )
 def main(
-        no_cache: bool = False,
+    no_cache: bool = False,
+    parallel: bool = False
 ):
     """Main entry point for the pipeline execution.
 
@@ -47,6 +51,7 @@ def main(
 
     Args:
         no_cache: If `True` cache will be disabled.
+        parallel: If `True` multiprocessing will be used for running hyperparameter tuning in parallel
     """
     client = Client()
     config_path = os.path.join(
@@ -56,25 +61,64 @@ def main(
     )
     enable_cache = not no_cache
 
-    alpha_values = [0.0001, 0.001, 0.01]
-    penalties = ["l2", "l1", "elasticnet"]
-    losses = ["hinge", "squared_hinge", "modified_huber"]
-    for penalty in penalties:
-        for loss in losses:
-            for alpha_value in alpha_values:
-                logger.info(f"Training with alpha: {alpha_value}, penalty: {penalty}, loss: {loss}")
-
-                model = Model(
-                    name="breast_cancer_classifier",
-                    tags=[f"alpha: {alpha_value}", f"penalty: {penalty}", f"loss: {loss}"]
-                )
-                try:
-                    training.with_options(config_path=config_path, enable_cache=enable_cache, model=model)(
-                        alpha_value=alpha_value, penalty=penalty, loss=loss)
-                except RuntimeError:
-                    pass
-                else:
-                    logger.info("Training pipeline finished successfully!\n\n")
+    # Run the feature engineering pipeline, this way all invocations within the training pipelines
+    # will use the cached output from this pipeline
+    feature_engineering()
+
+    # Here is our set of parameters that we want to explore to find the best combination
+    alpha_values = [0.0001, 0.001] # , 0.01]
+    penalties = ["l2", "l1"] # , "elasticnet"]
+    losses = ["hinge", "squared_hinge"] #, "modified_huber"]
+
+    # Let's loop over these
+    # Create a list of all parameter combinations
+    parameter_combinations = list(product(alpha_values, penalties, losses))
+
+    if parallel:
+        parallel_training(config_path, enable_cache, parameter_combinations)
+    else:
+        for alpha_value, penalty, loss in parameter_combinations:
+            train_model(alpha_value, penalty, loss, config_path, enable_cache)
+
+
+def parallel_training(config_path, enable_cache, parameter_combinations):
+    """Run ``train_model`` for every parameter combination in a pool of worker processes.
+
+    Blocks until all runs have completed. A failure of a worker itself (as opposed
+    to an error handled inside ``train_model``) is logged instead of being dropped.
+
+    Args:
+        config_path: Pipeline configuration path forwarded to ``train_model``.
+        enable_cache: Cache flag forwarded to ``train_model``.
+        parameter_combinations: Iterable of ``(alpha, penalty, loss)`` tuples.
+    """
+    # `import concurrent` alone does not load the `futures` submodule,
+    # so import what is needed explicitly
+    from concurrent.futures import ProcessPoolExecutor, as_completed
+
+    parameter_combinations = list(parameter_combinations)
+    if not parameter_combinations:
+        # Nothing to run; the worker count computed below would be 0, which the executor rejects
+        return
+
+    # Never start more worker processes than there are runs or CPU cores
+    num_workers = min(multiprocessing.cpu_count(), len(parameter_combinations))
+
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        # Submit all tasks to the executor, remembering which parameters each future belongs to
+        futures = {
+            executor.submit(train_model, alpha, penalty, loss, config_path, enable_cache): (alpha, penalty, loss)
+            for alpha, penalty, loss in parameter_combinations
+        }
+
+        # Wait for all tasks to complete and report workers that failed
+        for future in as_completed(futures):
+            error = future.exception()
+            if error is not None:
+                alpha, penalty, loss = futures[future]
+                logger.error(
+                    f"Worker failed for alpha: {alpha}, penalty: {penalty}, loss: {loss}: {error!r}"
+                )
+
+
+def train_model(alpha_value: float, penalty: str, loss: str, config_path: str, enable_cache: bool) -> None:
+    """Run the training pipeline once for a single hyperparameter combination.
+
+    The run is attached to a ``breast_cancer_classifier`` model whose tags record
+    the hyperparameters. Any exception raised while configuring or running the
+    pipeline is logged together with its traceback and then swallowed, so that one
+    failing combination does not abort the remaining runs.
+
+    Args:
+        alpha_value: Passed to the training pipeline as ``alpha_value``.
+        penalty: Passed to the training pipeline as ``penalty``.
+        loss: Passed to the training pipeline as ``loss``.
+        config_path: Path of the pipeline configuration file.
+        enable_cache: Whether caching is enabled for the pipeline run.
+    """
+    run_description = f"alpha: {alpha_value}, penalty: {penalty}, loss: {loss}"
+
+    model = Model(
+        name="breast_cancer_classifier",
+        tags=[f"alpha: {alpha_value}", f"penalty: {penalty}", f"loss: {loss}"]
+    )
+
+    logger.info(f"Starting training with {run_description}")
+    try:
+        training.with_options(
+            config_path=config_path, enable_cache=enable_cache, model=model
+        )(
+            alpha_value=alpha_value, penalty=penalty, loss=loss
+        )
+    except Exception:
+        # Deliberately broad: keep going with the other combinations.
+        # logger.exception records the message and the full traceback.
+        logger.exception(f"Error in training with {run_description}")
+    else:
+        logger.info(f"Training finished successfully for {run_description}\n\n")
 
 
 if __name__ == "__main__":