Use training data from a managed datasource (#134)

eedorenko · jotaylo · commit da60ab116083 · 2020-01-13T09:36:24.000-08:00
* Data from managed datastore

* merge with unassigned variable fix

* bugfix

* typo

* linting

* linting

* linting

* added a link to az cli

* doc update

* reregistering a dataset

* typo

* rephrasing

* rephrasing

* auth enabled

* revert auth enabled
diff --git a/.env.example b/.env.example
@@ -31,6 +31,9 @@ MODEL_PATH = ''
 EVALUATE_SCRIPT_PATH = 'evaluate/evaluate_model.py'
 REGISTER_SCRIPT_PATH = 'register/register_model.py'
 SOURCES_DIR_TRAIN = 'code'
+DATASET_NAME = 'diabetes_ds'
+DATASTORE_NAME = 'datablobstore'
+DATAFILE_NAME = 'diabetes.csv'
 
 # Optional. Used by a training pipeline with R on Databricks
 DB_CLUSTER_ID = ''
diff --git a/.pipelines/azdo-variables.yml b/.pipelines/azdo-variables.yml
@@ -39,4 +39,6 @@ variables:
 - name: DB_CLUSTER_ID
   value: ''
 - name: SCORE_SCRIPT
-  value: score.py
+  value: score.py
+- name: DATASET_NAME
+  value: diabetes_ds
diff --git a/code/training/train.py b/code/training/train.py
@@ -24,6 +24,7 @@
 POSSIBILITY OF SUCH DAMAGE.
 """
 from azureml.core.run import Run
+from azureml.core import Dataset
 import os
 import argparse
 from sklearn.datasets import load_diabetes
@@ -69,19 +70,34 @@ def main():
               "must be a positive float.")
     )
 
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        help=("Dataset with the training data")
+    )
     args = parser.parse_args()
 
     print("Argument [build_id]: %s" % args.build_id)
     print("Argument [model_name]: %s" % args.model_name)
     print("Argument [alpha]: %s" % args.alpha)
+    print("Argument [dataset_name]: %s" % args.dataset_name)
 
     model_name = args.model_name
     build_id = args.build_id
     alpha = args.alpha
+    dataset_name = args.dataset_name
 
     run = Run.get_context()
+    ws = run.experiment.workspace
+
+    if (dataset_name):
+        dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
+        df = dataset.to_pandas_dataframe()
+        X = df.drop("Y", axis=1).values  # keep target Y out of features
+        y = df.Y
+    else:
+        X, y = load_diabetes(return_X_y=True)
 
-    X, y = load_diabetes(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.2, random_state=0)
     data = {"train": {"X": X_train, "y": y_train},
diff --git a/docs/getting_started.md b/docs/getting_started.md
@@ -122,6 +122,10 @@ Check out the newly created resources in the [Azure Portal](portal.azure.com):
 
 (Optional) To remove the resources created for this project you can use the [/environment_setup/iac-remove-environment.yml](../environment_setup/iac-remove-environment.yml) definition or you can just delete the resource group in the [Azure Portal](portal.azure.com).
 
+**Note:** By default, the training ML pipeline uses a [sample diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) as training data. To use your own dataset, [create and register a datastore](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-access-data#azure-machine-learning-studio) in your ML workspace and upload the data file (e.g. [diabetes.csv](./data/diabetes.csv)) to the corresponding blob container. You can also define a datastore in the ML workspace with the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/ext/azure-cli-ml/ml/datastore?view=azure-cli-latest#ext-azure-cli-ml-az-ml-datastore-attach-blob).
+You'll also need to set the DATASTORE_NAME and DATAFILE_NAME variables in the ***devopsforai-aml-vg*** variable group. When both are set, the pipeline build registers the data file as a tabular dataset under the name given by DATASET_NAME and passes that name to the training step.
+
+
 ## Create an Azure DevOps Azure ML Workspace Service Connection
 Install the **Azure Machine Learning** extension to your organization from the
 [marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml),
diff --git a/ml_service/pipelines/build_train_pipeline.py b/ml_service/pipelines/build_train_pipeline.py
@@ -3,6 +3,7 @@
 from azureml.pipeline.core import Pipeline
 from azureml.core import Workspace
 from azureml.core.runconfig import RunConfiguration, CondaDependencies
+from azureml.core import Dataset, Datastore
 import os
 import sys
 sys.path.append(os.path.abspath("./ml_service/util"))  # NOQA: E402
@@ -35,10 +36,10 @@ def main():
                         'scikit-learn', 'tensorflow', 'keras'],
         pip_packages=['azure', 'azureml-core',
                       'azure-storage',
-                      'azure-storage-blob'])
+                      'azure-storage-blob',
+                      'azureml-dataprep'])
     )
     run_config.environment.docker.enabled = True
-
     config_envvar = {}
     if (e.collection_uri is not None and e.teamproject_name is not None):
         builduri_base = e.collection_uri + e.teamproject_name
@@ -53,6 +54,17 @@ def main():
     hyperparameter_alpha_param = PipelineParameter(
         name="hyperparameter_alpha", default_value=0.5)
 
+    dataset_name = ""
+    if (e.datastore_name is not None and e.datafile_name is not None):
+        dataset_name = e.dataset_name
+        datastore = Datastore.get(aml_workspace, e.datastore_name)
+        data_path = [(datastore, e.datafile_name)]
+        dataset = Dataset.Tabular.from_delimited_files(path=data_path)
+        dataset.register(workspace=aml_workspace,
+                         name=e.dataset_name,
+                         description="dataset with training data",
+                         create_new_version=True)
+
     train_step = PythonScriptStep(
         name="Train Model",
         script_name=e.train_script_path,
@@ -62,6 +74,7 @@ def main():
             "--build_id", build_id_param,
             "--model_name", model_name_param,
             "--alpha", hyperparameter_alpha_param,
+            "--dataset_name", dataset_name,
         ],
         runconfig=run_config,
         allow_reuse=False,
diff --git a/ml_service/util/env_variables.py b/ml_service/util/env_variables.py
@@ -41,6 +41,9 @@ def __init__(self):
         self._score_script = os.environ.get("SCORE_SCRIPT")
         self._collection_uri = os.environ.get("SYSTEM_COLLECTIONURI")
         self._teamproject_name = os.environ.get("SYSTEM_TEAMPROJECT")
+        self._datastore_name = os.environ.get("DATASTORE_NAME")
+        self._datafile_name = os.environ.get("DATAFILE_NAME")
+        self._dataset_name = os.environ.get("DATASET_NAME")
 
     @property
     def workspace_name(self):
@@ -145,3 +148,15 @@ def collection_uri(self):
     @property
     def teamproject_name(self):
         return self._teamproject_name
+
+    @property
+    def datastore_name(self):
+        return self._datastore_name
+
+    @property
+    def datafile_name(self):
+        return self._datafile_name
+
+    @property
+    def dataset_name(self):
+        return self._dataset_name