
Commit d0a1e5b

eedorenko authored and dtzar committed
Training a model with R on ML Compute and Databricks (#77)
1 parent 32ae724 commit d0a1e5b

12 files changed: 296 additions, 24 deletions

.env.example

Lines changed: 5 additions & 1 deletion
@@ -45,4 +45,8 @@ IMAGE_VERSION = ''
 # ACI Config
 ACI_CPU_CORES = ''
 ACI_MEM_GB = ''
-ACI_DESCRIPTION = ''
+ACI_DESCRIPTION = ''
+
+# Optional. Used by a training pipeline with R on Databricks
+DB_CLUSTER_ID = ''
+DATABRICKS_COMPUTE_NAME = ''
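
The two new values are optional and only matter for the R-on-Databricks training pipeline. A minimal sketch of how they could be read at pipeline-build time, assuming the usual python-dotenv/os.environ pattern; the snippet is illustrative and not part of this commit:

    import os
    from dotenv import load_dotenv  # python-dotenv is already listed in environment_setup/requirements.txt

    load_dotenv()  # harmless no-op without a .env file; real environment variables still apply

    db_cluster_id = os.environ.get("DB_CLUSTER_ID")
    databricks_compute_name = os.environ.get("DATABRICKS_COMPUTE_NAME")

    if not (db_cluster_id and databricks_compute_name):
        print("Databricks settings not provided; only needed for the R-on-Databricks training pipeline.")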

.pipelines/azdo-ci-build-train.yml

Lines changed: 22 additions & 3 deletions
@@ -15,17 +15,36 @@ variables:
 
 steps:
+
 - template: azdo-base-pipeline.yml
-
+
 - bash: |
-    # Invoke the Python building and publishing a training pipeline
+    # Invoke the Python building and publishing a training pipeline with Python on ML Compute
     python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline.py
   failOnStderr: 'false'
   env:
     SP_APP_SECRET: '$(SP_APP_SECRET)'
-  displayName: 'Publish Azure Machine Learning Pipeline'
+  displayName: 'Publish Azure Machine Learning Pipeline. Python on ML'
   enabled: 'true'
 
+- bash: |
+    # Invoke the Python building and publishing a training pipeline with R on ML Compute
+    python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline_with_r.py
+  failOnStderr: 'false'
+  env:
+    SP_APP_SECRET: '$(SP_APP_SECRET)'
+  displayName: 'Publish Azure Machine Learning Pipeline. R on ML Compute'
+  enabled: 'false'
+
+- bash: |
+    # Invoke the Python building and publishing a training pipeline with R on DataBricks
+    python3 $(Build.SourcesDirectory)/ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py
+  failOnStderr: 'false'
+  env:
+    SP_APP_SECRET: '$(SP_APP_SECRET)'
+  displayName: 'Publish Azure Machine Learning Pipeline. R on DataBricks'
+  enabled: 'false'
+
 - task: CopyFiles@2
   displayName: 'Copy Files to: $(Build.ArtifactStagingDirectory)'
   inputs:

code/training/R/r_train.r

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
print(R.version.string)

# COMMAND ----------

path="weight_data.csv"
print(paste("Reading file from",path))

routes<-read.csv(path, header=TRUE)

# The predictor vector (height).
x <- routes$height
# The response vector (weight).
y <- routes$weight
# Apply the lm() function.
model <- lm(y~x)

# COMMAND ----------

routes

# COMMAND ----------

# Make Predictions
df_test_heights <- data.frame(x = as.numeric(c(115,20)))
result <- predict(model,df_test_heights)
print(result)

# COMMAND ----------

# Save the model to blob storage
model_path="model.rds"
saveRDS(model, model_path)

# COMMAND ----------

# View model details
print(model)

# COMMAND ----------

print('Completed')

code/training/R/train_with_r.py

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
import os

os.system("Rscript r_train.r && ls -ltr model.rds")
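
The wrapper shells out with os.system and discards the return value, so a failing Rscript run would not fail the pipeline step. Purely as an illustrative variant (not part of this commit), a more defensive version could use subprocess:

    import subprocess

    # Run the R training script; raise (and fail the step) if Rscript exits non-zero.
    subprocess.run(["Rscript", "r_train.r"], check=True)
    subprocess.run(["ls", "-ltr", "model.rds"], check=True)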

code/training/R/train_with_r_on_databricks.py

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
import os
import argparse

parser = argparse.ArgumentParser("train")
parser.add_argument(
    "--AZUREML_SCRIPT_DIRECTORY_NAME",
    type=str,
    help="folder",
)

args, unknown = parser.parse_known_args()
folder = args.AZUREML_SCRIPT_DIRECTORY_NAME

os.system("cd " + "/dbfs/" + folder +
          " && Rscript r_train.r && ls -ltr model.rds")

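The wrapper above parses an --AZUREML_SCRIPT_DIRECTORY_NAME argument, which suggests the Databricks step passes in the DBFS folder to which Azure ML uploads the step's source directory. A minimal sketch of how such a step might be declared with the azureml-sdk DatabricksStep; the workspace wiring, names, and environment variables here are illustrative assumptions, not code from this commit:

    import os
    from azureml.core import Workspace
    from azureml.core.compute import ComputeTarget
    from azureml.pipeline.core import Pipeline
    from azureml.pipeline.steps import DatabricksStep

    ws = Workspace.from_config()  # assumes a local workspace config

    # The Databricks workspace must already be attached to the AML workspace under this name.
    databricks_compute = ComputeTarget(workspace=ws, name=os.environ["DATABRICKS_COMPUTE_NAME"])

    train_step = DatabricksStep(
        name="train_with_r_on_databricks",
        source_directory="code/training/R",                   # uploaded to DBFS for the run
        python_script_name="train_with_r_on_databricks.py",
        existing_cluster_id=os.environ["DB_CLUSTER_ID"],      # pre-created interactive cluster
        compute_target=databricks_compute,
        run_name="train_with_r_on_databricks",
        allow_reuse=False,
    )

    pipeline = Pipeline(workspace=ws, steps=[train_step])
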
code/training/R/weight_data.csv

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
height,weight
79,174
63,250
75,223
75,130
70,120
76,239
63,129
64,185
59,246
80,241
79,217
65,212
74,242
71,223
61,167
78,148
75,229
75,116
75,182
72,237
72,160
79,169
67,219
61,202
65,168
79,181
81,214
78,216
59,245

docs/code_description.md

Lines changed: 8 additions & 2 deletions
@@ -20,15 +20,21 @@
 
 ### ML Services
 
-- `ml_service/pipelines/build_train_pipeline.py` : builds and publishes an ML training pipeline.
-- `ml_service/pipelines/run_train_pipeline.py` : invokes a published ML training pipeline via REST API.
+- `ml_service/pipelines/build_train_pipeline.py` : builds and publishes an ML training pipeline. It uses Python on ML Compute.
+- `ml_service/pipelines/build_train_pipeline_with_r.py` : builds and publishes an ML training pipeline. It uses R on ML Compute.
+- `ml_service/pipelines/build_train_pipeline_with_r_on_dbricks.py` : builds and publishes an ML training pipeline. It uses R on Databricks Compute.
+- `ml_service/pipelines/run_train_pipeline.py` : invokes a published ML training pipeline (Python on ML Compute) via REST API.
 - `ml_service/util` : contains common utility functions used to build and publish an ML training pipeline.
 
 ### Code
 
 - `code/training/train.py` : a training step of an ML training pipeline.
 - `code/evaluate/evaluate_model.py` : an evaluating step of an ML training pipeline which registers a new trained model if evaluation shows the new model is more performant than the previous one.
 - `code/evaluate/register_model.py` : (LEGACY) registers a new trained model if evaluation shows the new model is more performant than the previous one.
+- `code/training/R/r_train.r` : trains a model with R based on a sample dataset (weight_data.csv).
+- `code/training/R/train_with_r.py` : a Python wrapper (ML Pipeline Step) invoking the R training script on ML Compute.
+- `code/training/R/train_with_r_on_databricks.py` : a Python wrapper (ML Pipeline Step) invoking the R training script on Databricks Compute.
+- `code/training/R/weight_data.csv` : a sample dataset used by the R script (r_train.r) to train a model.
 
 ### Scoring
 - code/scoring/score.py : a scoring script which is packed into a Docker image along with a model when deployed to a QA/Prod environment.
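
For comparison, the R-on-ML-Compute variant (build_train_pipeline_with_r.py, whose body is not shown in this diff) would typically wrap code/training/R/train_with_r.py in a PythonScriptStep. The sketch below is an illustrative assumption of that shape, not the actual file; the compute name and run configuration are placeholders:

    import os
    from azureml.core import Workspace
    from azureml.core.runconfig import RunConfiguration
    from azureml.pipeline.core import Pipeline
    from azureml.pipeline.steps import PythonScriptStep

    ws = Workspace.from_config()       # assumes a local workspace config
    run_config = RunConfiguration()    # would need an environment/image with R installed

    train_step = PythonScriptStep(
        name="train_with_r",
        script_name="train_with_r.py",
        source_directory="code/training/R",
        compute_target=os.environ["AML_COMPUTE_CLUSTER_NAME"],
        runconfig=run_config,
        allow_reuse=False,
    )

    pipeline = Pipeline(workspace=ws, steps=[train_step])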

docs/getting_started.md

Lines changed: 21 additions & 17 deletions
@@ -33,23 +33,25 @@ Please name your variable group **``devopsforai-aml-vg``** as we are using this
 
 The variable group should contain the following variables:
 
-| Variable Name               | Suggested Value              |
-| --------------------------- | ---------------------------- |
-| AML_COMPUTE_CLUSTER_CPU_SKU | STANDARD_DS2_V2              |
-| AML_COMPUTE_CLUSTER_NAME    | train-cluster                |
-| BASE_NAME                   | [unique base name]           |
-| EVALUATE_SCRIPT_PATH        | evaluate/evaluate_model.py   |
-| EXPERIMENT_NAME             | mlopspython                  |
-| LOCATION                    | centralus                    |
-| MODEL_NAME                  | sklearn_regression_model.pkl |
-| REGISTER_SCRIPT_PATH        | register/register_model.py   |
-| SOURCES_DIR_TRAIN           | code                         |
-| SP_APP_ID                   |                              |
-| SP_APP_SECRET               |                              |
-| SUBSCRIPTION_ID             |                              |
-| TENANT_ID                   |                              |
-| TRAIN_SCRIPT_PATH           | training/train.py            |
-| TRAINING_PIPELINE_NAME      | training-pipeline            |
+| Variable Name               | Suggested Value                     |
+| --------------------------- | ----------------------------------- |
+| AML_COMPUTE_CLUSTER_CPU_SKU | STANDARD_DS2_V2                     |
+| AML_COMPUTE_CLUSTER_NAME    | train-cluster                       |
+| BASE_NAME                   | [unique base name]                  |
+| DB_CLUSTER_ID               | [Optional Databricks cluster Id]    |
+| DATABRICKS_COMPUTE_NAME     | [Optional Databricks compute name]  |
+| EVALUATE_SCRIPT_PATH        | evaluate/evaluate_model.py          |
+| EXPERIMENT_NAME             | mlopspython                         |
+| LOCATION                    | centralus                           |
+| MODEL_NAME                  | sklearn_regression_model.pkl        |
+| REGISTER_SCRIPT_PATH        | register/register_model.py          |
+| SOURCES_DIR_TRAIN           | code                                |
+| SP_APP_ID                   |                                     |
+| SP_APP_SECRET               |                                     |
+| SUBSCRIPTION_ID             |                                     |
+| TENANT_ID                   |                                     |
+| TRAIN_SCRIPT_PATH           | training/train.py                   |
+| TRAINING_PIPELINE_NAME      | training-pipeline                   |
 
 Mark the **SP_APP_SECRET** variable as secret.

@@ -108,6 +110,8 @@ and checkout a published training pipeline in the **mlops-AML-WS** workspace in
 
 Great, you now have the build pipeline set up. You can either trigger it manually or it gets triggered automatically every time there is a change in the master branch. The pipeline performs linting and unit testing, and builds and publishes an **ML Training Pipeline** in an **ML Workspace**.
 
+**Note:** The build pipeline contains disabled steps that build and publish ML pipelines using R to train a model. Enable these steps if you want to try this approach. For the pipeline training a model with R on Databricks, you have to manually create a Databricks cluster and attach it to the ML Workspace as a compute target (the DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables should be specified).
+
 ### 7. Train the Model
 
 The next step is to invoke the training pipeline created in the previous step. It can be done with a **Release Pipeline**. Click on the Pipelines/Releases menu, then **New pipeline**, and then click on "Empty Job" in the "Select a template" window that pops up on the right:
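
The note in the getting_started.md diff above says the Databricks cluster has to be attached to the ML Workspace as a compute target manually. One way to script that attachment with the azureml-sdk is sketched below; the resource names and access-token handling are illustrative assumptions, not part of this commit:

    import os
    from azureml.core import Workspace
    from azureml.core.compute import ComputeTarget, DatabricksCompute

    ws = Workspace.from_config()  # assumes a local workspace config

    attach_config = DatabricksCompute.attach_configuration(
        resource_group="my-databricks-rg",                    # resource group of the Databricks workspace
        workspace_name="my-databricks-workspace",             # Databricks workspace name
        access_token=os.environ["DATABRICKS_ACCESS_TOKEN"],   # a Databricks personal access token
    )

    databricks_compute = ComputeTarget.attach(
        workspace=ws,
        name=os.environ["DATABRICKS_COMPUTE_NAME"],           # matches the new variable group entry
        attach_configuration=attach_config,
    )
    databricks_compute.wait_for_completion(show_output=True)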

environment_setup/Dockerfile

Lines changed: 2 additions & 1 deletion
@@ -8,6 +8,7 @@ LABEL org.label-schema.vendor = "Microsoft" \
 
 COPY environment_setup/requirements.txt /setup/
 
-RUN apt-get update && apt-get install gcc -y && pip install --upgrade -r /setup/requirements.txt
+RUN apt-get update && apt-get install gcc -y && pip install --upgrade -r /setup/requirements.txt && \
+    conda install -c r r-essentials
 
 CMD ["python"]

environment_setup/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 pytest==4.3.0
 requests>=2.22
+azureml>=0.2
 azureml-sdk>=1.0
 python-dotenv>=0.10.3
 flake8
