zenml-io
diff --git a/‎native-experiment-tracking/.dockerignore‎
Lines changed: 2 additions & 0 deletions b/‎native-experiment-tracking/.dockerignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎native-experiment-tracking/configs/training.yaml‎
Lines changed: 18 additions & 0 deletions b/‎native-experiment-tracking/configs/training.yaml‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎native-experiment-tracking/pipelines/__init__.py‎
Lines changed: 19 additions & 0 deletions b/‎native-experiment-tracking/pipelines/__init__.py‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎native-experiment-tracking/pipelines/feature_engineering.py‎
Lines changed: 74 additions & 0 deletions b/‎native-experiment-tracking/pipelines/feature_engineering.py‎
Lines changed: 74 additions & 0 deletions
diff --git a/‎native-experiment-tracking/pipelines/training.py‎
Lines changed: 80 additions & 0 deletions b/‎native-experiment-tracking/pipelines/training.py‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎native-experiment-tracking/requirements.txt‎
Lines changed: 8 additions & 0 deletions b/‎native-experiment-tracking/requirements.txt‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎native-experiment-tracking/run.py‎
Lines changed: 67 additions & 0 deletions b/‎native-experiment-tracking/run.py‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎native-experiment-tracking/steps/__init__.py‎
Lines changed: 38 additions & 0 deletions b/‎native-experiment-tracking/steps/__init__.py‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎native-experiment-tracking/steps/data_loader.py‎
Lines changed: 65 additions & 0 deletions b/‎native-experiment-tracking/steps/data_loader.py‎
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,2 @@
+.venv*
+.requirements*
@@ -0,0 +1,18 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+      - pandas
+    requirements:
+      - pyarrow
+      - matplotlib
+      - pillow
+      - numpy
+
+# configuration of the Model Control Plane
+model:
+  name: breast_cancer_classifier
+  license: Apache 2.0
+  description: A breast cancer classifier
+  tags: ["breast_cancer", "classifier"]
@@ -0,0 +1,19 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .feature_engineering import feature_engineering
+from .training import training
@@ -0,0 +1,74 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import List, Optional
+
+from steps import (
+    data_loader,
+    data_preprocessor,
+    data_splitter,
+)
+
+from zenml import pipeline
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def feature_engineering(
+    test_size: float = 0.2,
+    drop_na: Optional[bool] = None,
+    normalize: Optional[bool] = None,
+    drop_columns: Optional[List[str]] = None,
+    target: Optional[str] = "target",
+    random_state: int = 17,
+):
+    """
+    Feature engineering pipeline.
+
+    This is a pipeline that loads the data, processes it and splits
+    it into train and test sets.
+
+    Args:
+        test_size: Size of holdout set for training 0.0..1.0
+        drop_na: If `True` NA values will be removed from dataset
+        normalize: If `True` dataset will be normalized with MinMaxScaler
+        drop_columns: List of columns to drop from dataset
+        target: Name of target column in dataset
+        random_state: Random state to configure the data loader
+
+    Returns:
+        The processed datasets (dataset_trn, dataset_tst).
+    """
+    # Link all the steps together by calling them and passing the output
+    # of one step as the input of the next step.
+    raw_data = data_loader(random_state=random_state, target=target)
+    dataset_trn, dataset_tst = data_splitter(
+        dataset=raw_data,
+        test_size=test_size,
+    )
+    dataset_trn, dataset_tst, _ = data_preprocessor(
+        dataset_trn=dataset_trn,
+        dataset_tst=dataset_tst,
+        drop_na=drop_na,
+        normalize=normalize,
+        drop_columns=drop_columns,
+        target=target,
+        random_state=random_state,
+    )
+    return dataset_trn, dataset_tst
@@ -0,0 +1,80 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Optional
+from uuid import UUID
+
+from steps import model_evaluator, model_promoter, model_trainer, model_grid_searcher
+
+from pipelines import (
+    feature_engineering,
+)
+from zenml import pipeline
+from zenml.client import Client
+from zenml.logger import get_logger
+
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def training(
+    train_dataset_id: Optional[UUID] = None,
+    test_dataset_id: Optional[UUID] = None,
+    target: Optional[str] = "target",
+):
+    """
+    Model training pipeline.
+
+    This is a pipeline that loads the data from a preprocessing pipeline,
+    trains a model on it and evaluates the model. If it is the first model
+    to be trained, it will be promoted to production. If not, it will be
+    promoted only if it has a higher accuracy than the current production
+    model version.
+
+    Args:
+        train_dataset_id: ID of the train dataset produced by feature engineering.
+        test_dataset_id: ID of the test dataset produced by feature engineering.
+        target: Name of target column in dataset.
+    """
+    # Link all the steps together by calling them and passing the output
+    # of one step as the input of the next step.
+
+    # Execute Feature Engineering Pipeline
+    if train_dataset_id is None or test_dataset_id is None:
+        dataset_trn, dataset_tst = feature_engineering()
+    else:
+        client = Client()
+        dataset_trn = client.get_artifact_version(
+            name_id_or_prefix=train_dataset_id
+        )
+        dataset_tst = client.get_artifact_version(
+            name_id_or_prefix=test_dataset_id
+        )
+
+    model, _, _ = model_grid_searcher(
+        dataset_trn=dataset_trn, target=target
+    )
+
+    acc, _ = model_evaluator(
+        model=model,
+        dataset_trn=dataset_trn,
+        dataset_tst=dataset_tst,
+        target=target,
+    )
+
+    model_promoter(accuracy=acc)
@@ -0,0 +1,8 @@
+zenml[server]>=0.50.0
+notebook
+scikit-learn
+pyarrow
+pandas
+pillow
+matplotlib
+numpy
@@ -0,0 +1,67 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import os
+
+import click
+
+from zenml.client import Client
+from zenml.logger import get_logger
+
+from pipelines import training
+
+logger = get_logger(__name__)
+
+@click.option(
+    "--no-cache",
+    is_flag=True,
+    default=False,
+    help="Disable caching for the pipeline run.",
+)
+def main(
+    no_cache: bool = False,
+):
+    """Main entry point for the pipeline execution.
+
+    This entrypoint is where everything comes together:
+
+      * configuring pipeline with the required parameters
+        (some of which may come from command line arguments, but most
+        of which comes from the YAML config files)
+      * launching the pipeline
+
+    Args:
+        no_cache: If `True` cache will be disabled.
+    """
+    client = Client()
+
+    config_folder = os.path.join(
+        os.path.dirname(os.path.realpath(__file__)),
+        "configs",
+    )
+
+    pipeline_args = {}
+    if no_cache:
+        pipeline_args["enable_cache"] = False
+    pipeline_args["config_path"] = os.path.join(
+        config_folder, "training.yaml"
+    )
+    training.with_options(**pipeline_args)()
+    training.with_options(**pipeline_args)()
+    logger.info("Training pipeline with SGD finished successfully!\n\n")
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,38 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .data_loader import (
+    data_loader,
+)
+from .data_preprocessor import (
+    data_preprocessor,
+)
+from .data_splitter import (
+    data_splitter,
+)
+from .model_evaluator import (
+    model_evaluator,
+)
+from .model_promoter import (
+    model_promoter,
+)
+from .model_trainer import (
+    model_trainer,
+)
+from .model_grid_search import (
+    model_grid_searcher
+)
@@ -0,0 +1,65 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pandas as pd
+from sklearn.datasets import load_breast_cancer
+from typing_extensions import Annotated
+
+from zenml import step
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@step
+def data_loader(
+    random_state: int, is_inference: bool = False, target: str = "target"
+) -> Annotated[pd.DataFrame, "dataset"]:
+    """Dataset reader step.
+
+    This is an example of a dataset reader step that load Breast Cancer dataset.
+
+    This step is parameterized, which allows you to configure the step
+    independently of the step code, before running it in a pipeline.
+    In this example, the step can be configured with number of rows and logic
+    to drop target column or not. See the documentation for more information:
+
+        https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters
+
+    Args:
+        random_state: Random state for sampling
+        is_inference: If `True` subset will be returned and target column
+            will be removed from dataset.
+        target: Name of target columns in dataset.
+
+    Returns:
+        The dataset artifact as Pandas DataFrame and name of target column.
+    """
+    dataset = load_breast_cancer(as_frame=True)
+    inference_size = int(len(dataset.target) * 0.05)
+    dataset: pd.DataFrame = dataset.frame
+    inference_subset = dataset.sample(
+        inference_size, random_state=random_state
+    )
+    if is_inference:
+        dataset = inference_subset
+        dataset.drop(columns=target, inplace=True)
+    else:
+        dataset.drop(inference_subset.index, inplace=True)
+    dataset.reset_index(drop=True, inplace=True)
+    logger.info(f"Dataset with {len(dataset)} records loaded!")
+    return dataset