add demo (must be deleted)

safoinme · safoinme · commit 41518e05b70e · 2025-01-28T16:10:57.000+01:00
diff --git a/demo/README.md b/demo/README.md
@@ -0,0 +1,70 @@
+# ZenML Implementation Guide
+
+## Overview
+This guide outlines the step-by-step process for setting up and running the demonstrated ZenML pipeline with Neptune experiment tracking integration. The implementation follows a systematic approach to ensure reproducible machine learning workflows.
+
+## Prerequisites
+- Python 3.9 or higher
+- Access to a Neptune.ai account
+- A ZenML Cloud account
+
+## Installation and Setup Process
+
+### 1. Environment Setup
+First, create and activate a dedicated virtual environment:
+
+```bash
+# Create virtual environment
+python -m venv .venv
+
+# Activate virtual environment
+# For Unix/macOS
+source .venv/bin/activate
+```
+
+### 2. Dependencies Installation
+Install required packages from the requirements file:
+
+```bash
+pip install -r requirements.txt
+```
+
+### 3. ZenML Configuration
+Initialize and configure ZenML with the following steps:
+
+```bash
+# Initialize ZenML in your project directory
+zenml init
+zenml integration install pytorch_lightning neptune
+
+# Connect to your ZenML Cloud tenant (copy the exact command from the tenant's overview page)
+zenml login <your-tenant-id>
+
+# Register Neptune experiment tracker (fill in your Neptune project name and API token below)
+zenml experiment-tracker register neptune_experiment_tracker \
+    --flavor=neptune \
+    --project="" \
+    --api_token=""
+
+# Register and configure stack
+zenml stack register neptune_stack \
+    -o default \
+    -a default \
+    -e neptune_experiment_tracker
+
+# Set as active stack
+zenml stack set neptune_stack
+```
+
+### 4. Execute Pipeline
+Run the implementation:
+
+```bash
+python run.py
+```
+
+## Troubleshooting
+- Ensure all environment variables are properly set
+- Verify Neptune.ai credentials are correctly configured
+- Check ZenML stack status using `zenml stack list`
+
diff --git a/demo/configs/config.yaml b/demo/configs/config.yaml
@@ -0,0 +1,33 @@
+model:
+  name: cifar10_resnet18
+  description: "Fine-tune with ResNet18 on CIFAR10 using PyTorch Lightning and Neptune in GCP"
+  tags:
+    - pytorch_lightning
+    - demo
+    - neptune
+    - cifar10
+    - gcp
+
+settings:
+  docker:
+    #parent_image: pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime
+    python_package_installer: uv
+    required_integrations:
+      - pytorch
+      - neptune
+      - gcp
+      - pytorch_lightning
+    requirements:
+      - torchvision
+      - lightning
+      #- zenml==0.73.0
+
+parameters:
+  # Data parameters
+  batch_size: 256
+  val_split: 0.2
+  dataset_fraction: 0.05  # Use only 5% of the data for faster demo
+  
+  # Training parameters
+  epochs: 2
+  learning_rate: 0.04
diff --git a/demo/pipelines/cifar10_pipeline.py b/demo/pipelines/cifar10_pipeline.py
@@ -0,0 +1,71 @@
+from typing import Dict
+
+from steps.data_loader import load_cifar10_data
+from steps.evaluator import evaluate_model
+from steps.trainer import train_model
+
+from zenml import pipeline
+from zenml.config import DockerSettings
+from zenml.config.resource_settings import ResourceSettings
+from zenml.integrations.constants import PYTORCH
+from zenml.integrations.gcp.flavors.vertex_orchestrator_flavor import (
+    VertexOrchestratorSettings,
+)
+
+# Pod settings selecting V100 accelerator nodes; only referenced from the
+# commented-out "orchestrator" entry in the pipeline settings.
+vertex_settings = VertexOrchestratorSettings(
+    pod_settings={
+        "node_selectors": {"cloud.google.com/gke-accelerator": "NVIDIA_TESLA_V100"},
+    },
+)
+# Active CPU-only request; GPU alternative: ResourceSettings(gpu_count=1)
+resource_settings = ResourceSettings(cpu_count=16, memory="32GB")
+@pipeline(
+    enable_cache=True,
+    settings={
+        # "orchestrator": vertex_settings,
+        "resources": resource_settings,
+    },
+)
+def cifar10_pipeline(
+    batch_size: int = 256,
+    val_split: float = 0.2,
+    dataset_fraction: float = 0.05,  # Control dataset size
+    epochs: int = 5,
+    learning_rate: float = 0.05
+) -> Dict[str, float]:
+    """Load a CIFAR10 subset, train a model on it and evaluate the result.
+
+    The three steps are chained: the data-loading step feeds the trainer,
+    and the trained model plus the test dataloader feed the evaluator.
+
+    Args:
+        batch_size: Batch size forwarded to the data-loading step.
+        val_split: Fraction of the training data to use for validation.
+        dataset_fraction: Fraction of the full dataset to use (smaller
+            values make the demo faster).
+        epochs: Number of training epochs passed to the training step.
+        learning_rate: Learning rate passed to the training step as ``lr``.
+
+    Returns:
+        The output of the evaluation step (test loss and accuracy).
+    """
+    # Step 1: build the train / validation / test dataloaders.
+    loaders = load_cifar10_data(
+        batch_size=batch_size,
+        val_split=val_split,
+        dataset_fraction=dataset_fraction,
+    )
+    train_loader, val_loader, test_loader = loaders
+
+    # Step 2: fit the model using the train and validation dataloaders.
+    trained_model = train_model(
+        train_dataloader=train_loader,
+        val_dataloader=val_loader,
+        epochs=epochs,
+        lr=learning_rate,
+    )
+
+    # Step 3: score the trained model on the test dataloader.
+    return evaluate_model(model=trained_model, test_dataloader=test_loader)
diff --git a/demo/requirements.txt b/demo/requirements.txt
@@ -0,0 +1,8 @@
+torch
+torchvision
+torchmetrics
+zenml
+click
+pyyaml
+torchvision
+lightning
diff --git a/demo/run.py b/demo/run.py
@@ -0,0 +1,93 @@
+import os
+from typing import Optional
+
+import click
+import yaml
+from pipelines.cifar10_pipeline import cifar10_pipeline
+
+from zenml.client import Client
+from zenml.config.schedule import Schedule
+from zenml.integrations.neptune.experiment_trackers import (
+    NeptuneExperimentTracker,
+)
+
+
+@click.command(
+    help="""
+ZenML CIFAR10 Training Demo CLI.
+
+Run the ZenML CIFAR10 image classification training pipeline.
+
+Examples:
+
+  \b
+  # Run the pipeline with default config
+    python run.py
+
+  \b
+  # Run the pipeline with custom config
+    python run.py --config-path custom_config.yaml
+
+  \b
+  # Run without caching
+    python run.py --no-cache
+"""
+)
+@click.option(
+    "--config-path",
+    type=str,
+    default="configs/config.yaml",
+    help="Path to the YAML config file.",
+)
+@click.option(
+    "--no-cache",
+    is_flag=True,
+    default=False,
+    help="Disable caching for the pipeline run.",
+)
+def main(config_path: Optional[str] = None, no_cache: bool = False) -> None:
+    """Run the CIFAR10 training pipeline using parameters from a YAML config.
+
+    Args:
+        config_path: YAML config path; relative paths resolve from this file's directory.
+        no_cache: If True, disable caching.
+
+    Raises:
+        RuntimeError: If config_path is empty or the active stack lacks a Neptune tracker.
+    """
+    if not config_path:
+        raise RuntimeError("Config file is required to run the pipeline.")
+
+    # Resolve relative paths against this script so any working directory works
+    if not os.path.isabs(config_path):
+        script_dir = os.path.dirname(os.path.realpath(__file__))
+        config_path = os.path.join(script_dir, config_path)
+
+    # Load configuration
+    with open(config_path, "r", encoding="utf-8") as f:
+        config_dict = yaml.safe_load(f)
+
+    # Ensure neptune experiment tracker is active
+    stack = Client().active_stack
+    if not isinstance(stack.experiment_tracker, NeptuneExperimentTracker):
+        raise RuntimeError(
+            "This pipeline requires a Neptune experiment tracker in the active stack. "
+            "Please run: zenml experiment-tracker register <NAME> --flavor=neptune"
+        )
+
+    # Run the pipeline; the same config file is also passed on via config_path
+    params = config_dict["parameters"]
+    options = {"enable_cache": not no_cache, "config_path": config_path}
+    metrics = cifar10_pipeline.with_options(**options)(
+        batch_size=params["batch_size"],
+        val_split=params["val_split"],
+        dataset_fraction=params["dataset_fraction"],
+        epochs=params["epochs"],
+        learning_rate=params["learning_rate"],
+    )
+    click.echo("Training completed!")
+    click.echo(f"Test metrics: {metrics}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/demo/steps/data_loader.py b/demo/steps/data_loader.py
@@ -0,0 +1,114 @@
+import os
+import numpy as np
+import torch
+import torchvision
+from typing import Tuple, Annotated, List
+from torch.utils.data import DataLoader, random_split, Subset
+from zenml import step
+
+# Dataset location (overridable via $PATH_DATASETS) and dataloader defaults
+PATH_DATASETS = os.environ.get("PATH_DATASETS", ".")
+BATCH_SIZE = 256 if torch.cuda.is_available() else 64
+NUM_WORKERS = (os.cpu_count() or 4) // 2  # half the cores, or 2 if unknown
+
+# Channel statistics are given on the 0-255 scale and rescaled by 255
+cifar10_normalization = torchvision.transforms.Normalize(
+    mean=[channel / 255.0 for channel in (125.3, 123.0, 113.9)],
+    std=[channel / 255.0 for channel in (63.0, 62.1, 66.7)],
+)
+
+# Training samples get random crop + horizontal flip before normalization
+train_transforms = torchvision.transforms.Compose([
+    torchvision.transforms.RandomCrop(32, padding=4),
+    torchvision.transforms.RandomHorizontalFlip(),
+    torchvision.transforms.ToTensor(),
+    cifar10_normalization,
+])
+# Test samples are only converted to tensors and normalized
+test_transforms = torchvision.transforms.Compose(
+    [torchvision.transforms.ToTensor(), cifar10_normalization]
+)
+
+def get_subset_indices(total_size: int, fraction: float) -> List[int]:
+    """Return ``int(total_size * fraction)`` random distinct indices in ``range(total_size)``.
+
+    Args:
+        total_size: Total number of samples in the dataset.
+        fraction: Share of the dataset to keep; must lie in [0, 1].
+
+    Raises:
+        ValueError: If ``fraction`` is outside [0, 1].
+    """
+    if not 0.0 <= fraction <= 1.0:  # a negative count would slice from the end
+        raise ValueError(f"fraction must be in [0, 1], got {fraction}")
+    return np.random.permutation(total_size)[: int(total_size * fraction)].tolist()
+
+@step
+def load_cifar10_data(
+    batch_size: int = BATCH_SIZE,
+    val_split: float = 0.2,
+    dataset_fraction: float = 0.05  # Use only 5% of the data by default
+) -> Tuple[
+    Annotated[DataLoader, "train_dataloader"],
+    Annotated[DataLoader, "val_dataloader"],
+    Annotated[DataLoader, "test_dataloader"]
+]:
+    """Download CIFAR10 and build train, validation and test dataloaders.
+
+    A random ``dataset_fraction`` of both the training and the test set is
+    kept; the reduced training set is then split into train and validation
+    parts. The subset selection and the split are both seeded with 42.
+
+    Args:
+        batch_size: Batch size for the dataloaders
+        val_split: Fraction of training data to use for validation
+        dataset_fraction: Fraction of total dataset to use (for faster demo)
+
+    Returns:
+        The train, validation and test dataloaders, in that order. Only the
+        train dataloader shuffles its samples.
+    """
+    # Set random seed for reproducibility
+    np.random.seed(42)
+
+    # Load full datasets
+    dataset_train_full = torchvision.datasets.CIFAR10(
+        PATH_DATASETS, train=True, download=True, transform=train_transforms
+    )
+    dataset_test_full = torchvision.datasets.CIFAR10(
+        PATH_DATASETS, train=False, download=True, transform=test_transforms
+    )
+
+    # Keep a random fraction of each (train indices are drawn first, as the
+    # shared NumPy RNG makes the draw order part of the result)
+    train_indices = get_subset_indices(len(dataset_train_full), dataset_fraction)
+    test_indices = get_subset_indices(len(dataset_test_full), dataset_fraction)
+    dataset_train = Subset(dataset_train_full, train_indices)
+    dataset_test = Subset(dataset_test_full, test_indices)
+
+    # Split training into train and validation
+    # NOTE(review): the validation split wraps the training dataset, so it
+    # also goes through train_transforms (random crop/flip) - confirm intended.
+    train_length = int(len(dataset_train) * (1 - val_split))
+    val_length = len(dataset_train) - train_length
+    dataset_train, dataset_val = random_split(
+        dataset_train,
+        [train_length, val_length],
+        generator=torch.Generator().manual_seed(42),
+    )
+
+    print("Dataset sizes:")
+    print(f"Original training set: {len(dataset_train_full)} samples")
+    print(f"Original test set: {len(dataset_test_full)} samples")
+    print(f"After {dataset_fraction*100:.1f}% subset:")
+    print(f"  Training: {len(dataset_train)} samples")
+    print(f"  Validation: {len(dataset_val)} samples")
+    print(f"  Test: {len(dataset_test)} samples")
+
+    # Create dataloaders; only the training data is shuffled
+    loader_kwargs = {"batch_size": batch_size, "num_workers": NUM_WORKERS}
+    train_dataloader = DataLoader(dataset_train, shuffle=True, **loader_kwargs)
+    val_dataloader = DataLoader(dataset_val, shuffle=False, **loader_kwargs)
+    test_dataloader = DataLoader(dataset_test, shuffle=False, **loader_kwargs)
+
+    return train_dataloader, val_dataloader, test_dataloader
diff --git a/demo/steps/evaluator.py b/demo/steps/evaluator.py
diff --git a/demo/steps/model.py b/demo/steps/model.py
diff --git a/demo/steps/trainer.py b/demo/steps/trainer.py