Skip to content

Commit da60ab1

Browse files
eedorenko authored and jotaylo committed
Use training data from a managed datasource (#134)
* Data from managed datastore
* merge with unassigned variable fix
* bugfix
* typo
* linting
* linting
* linting
* added a link to az cli
* doc update
* reregistering a dataset
* typo
* rephrasing
* rephrasing
* auth enabled
* revert auth enabled
1 parent cb27dd6 commit da60ab1

File tree

6 files changed

+57
-4
lines changed

6 files changed

+57
-4
lines changed

.env.example

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ MODEL_PATH = ''
3131
EVALUATE_SCRIPT_PATH = 'evaluate/evaluate_model.py'
3232
REGISTER_SCRIPT_PATH = 'register/register_model.py'
3333
SOURCES_DIR_TRAIN = 'code'
34+
DATASET_NAME = 'diabetes_ds'
35+
DATASTORE_NAME = 'datablobstore'
36+
DATAFILE_NAME = 'diabetes.csv'
3437

3538
# Optional. Used by a training pipeline with R on Databricks
3639
DB_CLUSTER_ID = ''

.pipelines/azdo-variables.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,6 @@ variables:
3939
- name: DB_CLUSTER_ID
4040
value: ''
4141
- name: SCORE_SCRIPT
42-
value: score.py
42+
value: score.py
43+
- name: DATASET_NAME
44+
value: diabetes_ds

code/training/train.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
POSSIBILITY OF SUCH DAMAGE.
2525
"""
2626
from azureml.core.run import Run
27+
from azureml.core import Dataset
2728
import os
2829
import argparse
2930
from sklearn.datasets import load_diabetes
@@ -69,19 +70,34 @@ def main():
6970
"must be a positive float.")
7071
)
7172

73+
parser.add_argument(
74+
"--dataset_name",
75+
type=str,
76+
help=("Dataset with the training data")
77+
)
7278
args = parser.parse_args()
7379

7480
print("Argument [build_id]: %s" % args.build_id)
7581
print("Argument [model_name]: %s" % args.model_name)
7682
print("Argument [alpha]: %s" % args.alpha)
83+
print("Argument [dataset_name]: %s" % args.dataset_name)
7784

7885
model_name = args.model_name
7986
build_id = args.build_id
8087
alpha = args.alpha
88+
dataset_name = args.dataset_name
8189

8290
run = Run.get_context()
91+
ws = run.experiment.workspace
92+
93+
if (dataset_name):
94+
dataset = Dataset.get_by_name(workspace=ws, name=dataset_name)
95+
df = dataset.to_pandas_dataframe()
96+
X = df.values
97+
y = df.Y
98+
else:
99+
X, y = load_diabetes(return_X_y=True)
83100

84-
X, y = load_diabetes(return_X_y=True)
85101
X_train, X_test, y_train, y_test = train_test_split(
86102
X, y, test_size=0.2, random_state=0)
87103
data = {"train": {"X": X_train, "y": y_train},

docs/getting_started.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,10 @@ Check out the newly created resources in the [Azure Portal](https://portal.azure.com):
122122

123123
(Optional) To remove the resources created for this project, you can use the [/environment_setup/iac-remove-environment.yml](../environment_setup/iac-remove-environment.yml) definition, or you can simply delete the resource group in the [Azure Portal](https://portal.azure.com).
124124

125+
**Note:** By default, the training ML pipeline uses a [sample diabetes dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html) as training data. To use your own dataset instead, [create and register a datastore](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-access-data#azure-machine-learning-studio) in your ML workspace and upload the data file (e.g. [diabetes.csv](./data/diabetes.csv)) to the corresponding blob container. You can also attach a datastore to the ML workspace with the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/ext/azure-cli-ml/ml/datastore?view=azure-cli-latest#ext-azure-cli-ml-az-ml-datastore-attach-blob).
126+
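You'll also need to set the `DATASTORE_NAME` and `DATAFILE_NAME` variables in the ***devopsforai-aml-vg*** variable group. When both are set, the pipeline build registers the data file as a dataset under the name given by `DATASET_NAME` and passes that name to the training step.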
127+
128+
125129
## Create an Azure DevOps Azure ML Workspace Service Connection
126130
Install the **Azure Machine Learning** extension to your organization from the
127131
[marketplace](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.vss-services-azureml),

ml_service/pipelines/build_train_pipeline.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from azureml.pipeline.core import Pipeline
44
from azureml.core import Workspace
55
from azureml.core.runconfig import RunConfiguration, CondaDependencies
6+
from azureml.core import Dataset, Datastore
67
import os
78
import sys
89
sys.path.append(os.path.abspath("./ml_service/util")) # NOQA: E402
@@ -35,10 +36,10 @@ def main():
3536
'scikit-learn', 'tensorflow', 'keras'],
3637
pip_packages=['azure', 'azureml-core',
3738
'azure-storage',
38-
'azure-storage-blob'])
39+
'azure-storage-blob',
40+
'azureml-dataprep'])
3941
)
4042
run_config.environment.docker.enabled = True
41-
4243
config_envvar = {}
4344
if (e.collection_uri is not None and e.teamproject_name is not None):
4445
builduri_base = e.collection_uri + e.teamproject_name
@@ -53,6 +54,17 @@ def main():
5354
hyperparameter_alpha_param = PipelineParameter(
5455
name="hyperparameter_alpha", default_value=0.5)
5556

57+
dataset_name = ""
58+
if (e.datastore_name is not None and e.datafile_name is not None):
59+
dataset_name = e.dataset_name
60+
datastore = Datastore.get(aml_workspace, e.datastore_name)
61+
data_path = [(datastore, e.datafile_name)]
62+
dataset = Dataset.Tabular.from_delimited_files(path=data_path)
63+
dataset.register(workspace=aml_workspace,
64+
name=e.dataset_name,
65+
description="dataset with training data",
66+
create_new_version=True)
67+
5668
train_step = PythonScriptStep(
5769
name="Train Model",
5870
script_name=e.train_script_path,
@@ -62,6 +74,7 @@ def main():
6274
"--build_id", build_id_param,
6375
"--model_name", model_name_param,
6476
"--alpha", hyperparameter_alpha_param,
77+
"--dataset_name", dataset_name,
6578
],
6679
runconfig=run_config,
6780
allow_reuse=False,

ml_service/util/env_variables.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ def __init__(self):
4141
self._score_script = os.environ.get("SCORE_SCRIPT")
4242
self._collection_uri = os.environ.get("SYSTEM_COLLECTIONURI")
4343
self._teamproject_name = os.environ.get("SYSTEM_TEAMPROJECT")
44+
self._datastore_name = os.environ.get("DATASTORE_NAME")
45+
self._datafile_name = os.environ.get("DATAFILE_NAME")
46+
self._dataset_name = os.environ.get("DATASET_NAME")
4447

4548
@property
4649
def workspace_name(self):
@@ -145,3 +148,15 @@ def collection_uri(self):
145148
@property
146149
def teamproject_name(self):
147150
return self._teamproject_name
151+
152+
@property
153+
def datastore_name(self):
154+
return self._datastore_name
155+
156+
@property
157+
def datafile_name(self):
158+
return self._datafile_name
159+
160+
@property
161+
def dataset_name(self):
162+
return self._dataset_name

0 commit comments

Comments
 (0)