example-test-organization
diff --git a/‎samples/iris-sklearn/README.md
Lines changed: 50 additions & 0 deletions b/‎samples/iris-sklearn/README.md
Lines changed: 50 additions & 0 deletions
diff --git a/‎samples/iris-sklearn/iris-pipeline-compiled.py
Lines changed: 214 additions & 0 deletions b/‎samples/iris-sklearn/iris-pipeline-compiled.py
Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,50 @@
+# Iris SKLearn Pipeline
+
+This sample demonstrates a basic data science pipeline that trains a model on the [Iris Dataset](https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html) with the sklearn library.
+
+## Prerequisites
+- Install [KFP Tekton prerequisites](/samples/README.md)
+- Install the additional requirements from [requirements.txt](./requirements.txt)
+
+## Instructions
+
+This sample provides two different ways to execute the pipeline. The first option is to compile the pipeline to a Tekton pipeline YAML file; the second option is to connect directly to the Kubeflow Pipeline UI using the kfp TektonClient and run the pipeline directly.
+
+### Compiled
+
+The compiled pipeline uses the kfp-tekton `TektonCompiler()` to generate a yaml object.  The `TektonCompiler()` will produce a Tekton PipelineRun yaml object in the same directory called `iris-pipeline-compiled.yaml`.
+
+This pipeline uses a PVC, so you may need to set a storage class as an environment variable to match one that is available on your cluster.
+
+To compile the pipeline run:
+
+```sh
+# Optional: Set the storage class for the pipeline
+export DEFAULT_STORAGE_CLASS="my-storage-class"
+
+python iris-pipeline-compiled.py
+```
+
+Once the pipeline is compiled, upload the `iris-pipeline-compiled.yaml` file to the Kubeflow Pipeline dashboard with Tekton Backend.  Once the pipeline is uploaded, you can create a new Pipeline Run from the Dashboard.
+
+### Direct Run
+
+The direct run uses the kfp-tekton `TektonClient()` to connect directly to Kubeflow and create a pipeline run.
+
+Like the compiled pipeline, this pipeline uses a PVC, so you may need to set a storage class as an environment variable to match one that is available on your cluster.  This example also relies on some additional environment variables to set the Kubeflow UI endpoint and the user's bearer token to authenticate to the UI.
+
+To execute the pipeline run:
+
+```sh
+# Optional: Set the storage class for the pipeline
+export DEFAULT_STORAGE_CLASS="my-storage-class"
+
+DS_PIPELINE_NAMESPACE="my-namespace"
+export KUBEFLOW_ENDPOINT="https://$(oc get route ds-pipeline-ui -n ${DS_PIPELINE_NAMESPACE} -o jsonpath='{.spec.host}')"
+
+export BEARER_TOKEN=$(oc whoami --show-token)
+
+python iris-pipeline-direct-run.py
+```
+
+A pipeline run will automatically kick off in the UI.
@@ -0,0 +1,214 @@
+"""Example of a pipeline to demonstrate a simple data science workflow."""
+import os
+import urllib
+
+import kfp
+
+import kfp_tekton
+
+
+def data_prep(
+    X_train_file: kfp.components.OutputPath(),
+    X_test_file: kfp.components.OutputPath(),
+    y_train_file: kfp.components.OutputPath(),
+    y_test_file: kfp.components.OutputPath(),
+):
+    import pickle
+
+    import pandas as pd
+
+    from sklearn import datasets
+    from sklearn.model_selection import train_test_split
+
+    def get_iris_data() -> pd.DataFrame:
+        iris = datasets.load_iris()
+        data = pd.DataFrame(
+            {
+                "sepalLength": iris.data[:, 0],
+                "sepalWidth": iris.data[:, 1],
+                "petalLength": iris.data[:, 2],
+                "petalWidth": iris.data[:, 3],
+                "species": iris.target,
+            }
+        )
+
+        print("Initial Dataset:")
+        print(data.head())
+
+        return data
+
+    def create_training_set(dataset: pd.DataFrame, test_size: float = 0.3):
+        # Features
+        X = dataset[["sepalLength", "sepalWidth", "petalLength", "petalWidth"]]
+        # Labels
+        y = dataset["species"]
+
+        # Split dataset into training set and test set
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=test_size, random_state=11
+        )
+
+        return X_train, X_test, y_train, y_test
+
+    def save_pickle(object_file, target_object):
+        with open(object_file, "wb") as f:
+            pickle.dump(target_object, f)
+
+    dataset = get_iris_data()
+    X_train, X_test, y_train, y_test = create_training_set(dataset)
+
+    save_pickle(X_train_file, X_train)
+    save_pickle(X_test_file, X_test)
+    save_pickle(y_train_file, y_train)
+    save_pickle(y_test_file, y_test)
+
+
+def train_model(
+    X_train_file: kfp.components.InputPath(),
+    y_train_file: kfp.components.InputPath(),
+    model_file: kfp.components.OutputPath(),
+):
+    import pickle
+
+    from sklearn.ensemble import RandomForestClassifier
+
+    def load_pickle(object_file):
+        with open(object_file, "rb") as f:
+            target_object = pickle.load(f)
+
+        return target_object
+
+    def save_pickle(object_file, target_object):
+        with open(object_file, "wb") as f:
+            pickle.dump(target_object, f)
+
+    def train_iris(X_train: pd.DataFrame, y_train: pd.DataFrame):
+        model = RandomForestClassifier(n_estimators=100)
+        model.fit(X_train, y_train)
+
+        return model
+
+    X_train = load_pickle(X_train_file)
+    y_train = load_pickle(y_train_file)
+
+    model = train_iris(X_train, y_train)
+
+    save_pickle(model_file, model)
+
+
+def validate_model(model_file: kfp.components.InputPath()):
+    import pickle
+
+    def load_pickle(object_file):
+        with open(object_file, "rb") as f:
+            target_object = pickle.load(f)
+
+        return target_object
+
+    model = load_pickle(model_file)
+
+    input_values = [[5, 3, 1.6, 0.2]]
+
+    print(f"Performing test prediction on {input_values}")
+    result = model.predict(input_values)
+
+    print(f"Response: {result}")
+
+
+def evaluate_model(
+    X_test_file: kfp.components.InputPath(),
+    y_test_file: kfp.components.InputPath(),
+    model_file: kfp.components.InputPath(),
+    mlpipeline_metrics_file: kfp.components.OutputPath("Metrics"),
+):
+    import json
+    import pickle
+
+    from sklearn.metrics import accuracy_score
+
+    def load_pickle(object_file):
+        with open(object_file, "rb") as f:
+            target_object = pickle.load(f)
+
+        return target_object
+
+    X_test = load_pickle(X_test_file)
+    y_test = load_pickle(y_test_file)
+    model = load_pickle(model_file)
+
+    y_pred = model.predict(X_test)
+
+    accuracy_score_metric = accuracy_score(y_test, y_pred)
+    print(f"Accuracy: {accuracy_score_metric}")
+
+    metrics = {
+        "metrics": [
+            {
+                "name": "accuracy-score",
+                "numberValue": accuracy_score_metric,
+                "format": "PERCENTAGE",
+            },
+        ]
+    }
+
+    with open(mlpipeline_metrics_file, "w") as f:
+        json.dump(metrics, f)
+
+
+data_prep_op = kfp.components.create_component_from_func(
+    data_prep,
+    base_image="image-registry.openshift-image-registry.svc:5000/openshift/python:latest",
+    packages_to_install=["pandas", "scikit-learn"],
+)
+
+train_model_op = kfp.components.create_component_from_func(
+    train_model,
+    base_image="image-registry.openshift-image-registry.svc:5000/openshift/python:latest",
+    packages_to_install=["pandas", "scikit-learn"],
+)
+
+evaluate_model_op = kfp.components.create_component_from_func(
+    evaluate_model,
+    base_image="image-registry.openshift-image-registry.svc:5000/openshift/python:latest",
+    packages_to_install=["pandas", "scikit-learn"],
+)
+
+validate_model_op = kfp.components.create_component_from_func(
+    validate_model,
+    base_image="image-registry.openshift-image-registry.svc:5000/openshift/python:latest",
+    packages_to_install=["pandas", "scikit-learn"],
+)
+
+
+@kfp.dsl.pipeline(
+    name="Iris Pipeline",
+)
+def iris_pipeline(model_obc: str = "iris-model"):
+    data_prep_task = data_prep_op()
+
+    train_model_task = train_model_op(
+        data_prep_task.outputs["X_train"],
+        data_prep_task.outputs["y_train"],
+    )
+
+    evaluate_model_task = evaluate_model_op(  # noqa: F841
+        data_prep_task.outputs["X_test"],
+        data_prep_task.outputs["y_test"],
+        train_model_task.output,
+    )
+
+    validate_model_task = validate_model_op(train_model_task.output)  # noqa: F841
+
+
+if __name__ == "__main__":
+    # set the default storage class and mode if they don't already exists
+    os.environ["DEFAULT_STORAGE_CLASS"] = os.environ.get(
+        "DEFAULT_STORAGE_CLASS", "ocs-storagecluster-ceph-rbd"
+    )
+    os.environ["DEFAULT_ACCESSMODES"] = os.environ.get(
+        "DEFAULT_ACCESSMODES", "ReadWriteOnce"
+    )
+
+    kfp_tekton.compiler.TektonCompiler().compile(
+        iris_pipeline, __file__.replace(".py", ".yaml")
+    )