Skip to content

Commit 18d6fb1

Browse files
committed
Initial implementation
1 parent d3759d4 commit 18d6fb1

File tree

17 files changed

+1113
-0
lines changed

17 files changed

+1113
-0
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
.venv*
2+
.requirements*
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# environment configuration
2+
settings:
3+
docker:
4+
required_integrations:
5+
- sklearn
6+
- pandas
7+
requirements:
8+
- pyarrow
9+
- matplotlib
10+
- pillow
11+
- numpy
12+
13+
# configuration of the Model Control Plane
14+
model:
15+
name: breast_cancer_classifier
16+
license: Apache 2.0
17+
description: A breast cancer classifier
18+
tags: ["breast_cancer", "classifier"]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Apache Software License 2.0
2+
#
3+
# Copyright (c) ZenML GmbH 2024. All rights reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
from .feature_engineering import feature_engineering
19+
from .training import training
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
# Apache Software License 2.0
2+
#
3+
# Copyright (c) ZenML GmbH 2024. All rights reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
from typing import List, Optional
19+
20+
from steps import (
21+
data_loader,
22+
data_preprocessor,
23+
data_splitter,
24+
)
25+
26+
from zenml import pipeline
27+
from zenml.logger import get_logger
28+
29+
logger = get_logger(__name__)
30+
31+
32+
@pipeline
def feature_engineering(
    test_size: float = 0.2,
    drop_na: Optional[bool] = None,
    normalize: Optional[bool] = None,
    drop_columns: Optional[List[str]] = None,
    target: Optional[str] = "target",
    random_state: int = 17,
):
    """Feature engineering pipeline: load, split, then preprocess the data.

    Three steps are chained: `data_loader` produces the raw dataset,
    `data_splitter` divides it into a train and a test part, and
    `data_preprocessor` transforms both parts.

    Args:
        test_size: Size of the holdout (test) set, 0.0..1.0. Forwarded to
            `data_splitter`.
        drop_na: If `True` NA values will be removed from dataset.
        normalize: If `True` dataset will be normalized with MinMaxScaler.
        drop_columns: List of columns to drop from dataset.
        target: Name of target column in dataset. Forwarded to both
            `data_loader` and `data_preprocessor`.
        random_state: Random state forwarded to both `data_loader` and
            `data_preprocessor`.

    Returns:
        The processed datasets (dataset_trn, dataset_tst).
    """
    # Step 1: load the raw dataset.
    loaded_dataset = data_loader(random_state=random_state, target=target)

    # Step 2: split it into train and test parts.
    split_trn, split_tst = data_splitter(
        dataset=loaded_dataset,
        test_size=test_size,
    )

    # Step 3: preprocess both parts. The step yields a third output that
    # this pipeline does not use, so it is discarded.
    processed_trn, processed_tst, _ = data_preprocessor(
        dataset_trn=split_trn,
        dataset_tst=split_tst,
        drop_na=drop_na,
        normalize=normalize,
        drop_columns=drop_columns,
        target=target,
        random_state=random_state,
    )
    return processed_trn, processed_tst
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# Apache Software License 2.0
2+
#
3+
# Copyright (c) ZenML GmbH 2024. All rights reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
from typing import Optional
19+
from uuid import UUID
20+
21+
from steps import model_evaluator, model_promoter, model_trainer, model_grid_searcher
22+
23+
from pipelines import (
24+
feature_engineering,
25+
)
26+
from zenml import pipeline
27+
from zenml.client import Client
28+
from zenml.logger import get_logger
29+
30+
31+
logger = get_logger(__name__)
32+
33+
34+
@pipeline
def training(
    train_dataset_id: Optional[UUID] = None,
    test_dataset_id: Optional[UUID] = None,
    target: Optional[str] = "target",
):
    """Model training pipeline.

    Obtains a train and a test dataset, runs a grid search on the train
    dataset to get a model, evaluates that model on both datasets and hands
    the resulting accuracy to `model_promoter`. The promotion decision
    itself is made inside `model_promoter`.

    The datasets come from one of two places:

    * if either dataset ID is missing, the `feature_engineering` pipeline
      is executed and its outputs are used;
    * otherwise both artifact versions are looked up by ID through the
      ZenML client.

    Args:
        train_dataset_id: ID of the train dataset produced by feature engineering.
        test_dataset_id: ID of the test dataset produced by feature engineering.
        target: Name of target column in dataset.
    """
    # Both IDs are required to reuse existing artifacts; with only one of
    # them given, the datasets are regenerated.
    if train_dataset_id is None or test_dataset_id is None:
        # Forward `target` so that feature engineering and the downstream
        # search/evaluation steps agree on the target column name.
        dataset_trn, dataset_tst = feature_engineering(target=target)
    else:
        client = Client()
        dataset_trn = client.get_artifact_version(
            name_id_or_prefix=train_dataset_id
        )
        dataset_tst = client.get_artifact_version(
            name_id_or_prefix=test_dataset_id
        )

    # Only the first output (the model) of the grid search is used here.
    model, _, _ = model_grid_searcher(
        dataset_trn=dataset_trn, target=target
    )

    # Only the first output (the accuracy) of the evaluator is used here.
    acc, _ = model_evaluator(
        model=model,
        dataset_trn=dataset_trn,
        dataset_tst=dataset_tst,
        target=target,
    )

    model_promoter(accuracy=acc)
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
zenml[server]>=0.50.0
2+
notebook
3+
scikit-learn
4+
pyarrow
5+
pandas
6+
pillow
7+
matplotlib
8+
numpy

native-experiment-tracking/run.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Apache Software License 2.0
2+
#
3+
# Copyright (c) ZenML GmbH 2024. All rights reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
import os
18+
19+
import click
20+
21+
from zenml.client import Client
22+
from zenml.logger import get_logger
23+
24+
from pipelines import training
25+
26+
logger = get_logger(__name__)
27+
28+
@click.command()
@click.option(
    "--no-cache",
    is_flag=True,
    default=False,
    help="Disable caching for the pipeline run.",
)
def main(
    no_cache: bool = False,
):
    """Main entry point for the pipeline execution.

    This entrypoint is where everything comes together:

      * configuring pipeline with the required parameters
        (some of which may come from command line arguments, but most
        of which comes from the YAML config files)
      * launching the pipeline

    Args:
        no_cache: If `True` cache will be disabled.
    """
    # NOTE(review): `client` is not used below; kept in case instantiating
    # the client is relied upon for its side effects - confirm and remove.
    client = Client()

    # The YAML configs live in a `configs` folder next to this script.
    config_folder = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        "configs",
    )

    pipeline_args = {}
    if no_cache:
        pipeline_args["enable_cache"] = False
    pipeline_args["config_path"] = os.path.join(
        config_folder, "training.yaml"
    )

    # NOTE(review): the pipeline is launched twice with identical options,
    # as in the original code - confirm that the second run is intentional.
    training.with_options(**pipeline_args)()
    training.with_options(**pipeline_args)()
    logger.info("Training pipeline with SGD finished successfully!\n\n")


if __name__ == "__main__":
    main()
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Apache Software License 2.0
2+
#
3+
# Copyright (c) ZenML GmbH 2024. All rights reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
from .data_loader import (
19+
data_loader,
20+
)
21+
from .data_preprocessor import (
22+
data_preprocessor,
23+
)
24+
from .data_splitter import (
25+
data_splitter,
26+
)
27+
from .model_evaluator import (
28+
model_evaluator,
29+
)
30+
from .model_promoter import (
31+
model_promoter,
32+
)
33+
from .model_trainer import (
34+
model_trainer,
35+
)
36+
from .model_grid_search import (
37+
model_grid_searcher
38+
)
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Apache Software License 2.0
2+
#
3+
# Copyright (c) ZenML GmbH 2024. All rights reserved.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
#
17+
18+
import pandas as pd
19+
from sklearn.datasets import load_breast_cancer
20+
from typing_extensions import Annotated
21+
22+
from zenml import step
23+
from zenml.logger import get_logger
24+
25+
logger = get_logger(__name__)
26+
27+
28+
@step
def data_loader(
    random_state: int, is_inference: bool = False, target: str = "target"
) -> Annotated[pd.DataFrame, "dataset"]:
    """Dataset reader step.

    Loads the Breast Cancer dataset as a Pandas DataFrame and sets aside a
    random 5% sample of its rows as the inference subset. In inference mode
    only that subset is returned, with the target column removed. Otherwise
    the subset rows are removed and the remaining rows are returned with a
    freshly reset index.

    This step is parameterized, which allows you to configure the step
    independently of the step code, before running it in a pipeline.
    See the documentation for more information:

        https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters

    Args:
        random_state: Random state for sampling the inference subset.
        is_inference: If `True` only the inference subset is returned and
            the target column is removed from it.
        target: Name of the target column in the dataset.

    Returns:
        The dataset artifact as Pandas DataFrame.
    """
    bunch = load_breast_cancer(as_frame=True)
    full_frame: pd.DataFrame = bunch.frame

    # 5% of the records (rounded down) are reserved for inference.
    holdout_rows = int(len(bunch.target) * 0.05)
    holdout = full_frame.sample(holdout_rows, random_state=random_state)

    if is_inference:
        # Inference data must not carry the label column.
        dataset = holdout.drop(columns=target)
    else:
        # Training data is everything that was not reserved for inference.
        dataset = full_frame.drop(index=holdout.index).reset_index(drop=True)

    logger.info(f"Dataset with {len(dataset)} records loaded!")
    return dataset

0 commit comments

Comments
 (0)