-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Expand file tree
/
Copy pathhftransformers-fridgeobjects-multiclass-classification.sh
More file actions
211 lines (177 loc) · 8.37 KB
/
hftransformers-fridgeobjects-multiclass-classification.sh
File metadata and controls
211 lines (177 loc) · 8.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#!/bin/bash
# End-to-end sample: finetune a HuggingFace image-classification model (BEiT)
# on the fridge-objects multiclass dataset with Azure ML, register the
# finetuned model, deploy it to an online endpoint, score a sample request,
# and clean up.
# Requires: Azure CLI with the `ml` extension and an Azure ML workspace.
set -x
# script inputs
registry_name="azureml"
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
# CPU cluster for the model-import step; GPU cluster for finetuning
compute_cluster_model_import="sample-model-import-cluster"
compute_cluster_finetune="sample-finetune-cluster-gpu"
# using the same compute cluster for model evaluation as finetuning. If you want to use a different cluster, specify it below
compute_model_evaluation="sample-finetune-cluster-gpu"
# If above compute cluster does not exist, create it with the following vm size
compute_model_import_sku="Standard_D12"
compute_finetune_sku="STANDARD_NC4AS_T4_V3"
compute_model_evaluation_sku="STANDARD_NC4AS_T4_V3"
# This is the number of GPUs in a single node of the selected 'vm_size' compute.
# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
# Setting this to more than the number of GPUs will result in an error.
gpus_per_node=1
# huggingFace model
huggingface_model_name="microsoft/beit-base-patch16-224-pt22k-ft22k"
# This is the foundation model for finetuning from azureml system registry
aml_registry_model_name="microsoft-beit-base-patch16-224-pt22k-ft22k"
model_label="latest"
# Epoch seconds double as a unique suffix for the registered model version
# and the endpoint name below
version=$(date +%s)
finetuned_huggingface_model_name="microsoft-beit-base-patch16-224-pt22k-ft22k-fridge-objects-multiclass-classification"
huggingface_endpoint_name="hf-mc-fridge-items-$version"
deployment_sku="Standard_DS3_V2"
# Deepspeed config
ds_finetune="./deepspeed_configs/zero1.json"
# Scoring file
huggingface_sample_request_data="./huggingface_sample_request_data.json"
# finetuning job parameters
finetuning_pipeline_component="transformers_image_classification_pipeline"
# Training settings
process_count_per_instance=$gpus_per_node # set to the number of GPUs available in the compute
# 1. Install dependencies
# NOTE: the version specifiers must be quoted. Unquoted, ">=1.23.1" is parsed
# by the shell as an output redirection to a file named "=1.23.1", so pip
# silently installed an unpinned azure-ai-ml instead of enforcing the minimum.
pip install "azure-ai-ml>=1.23.1"
pip install "azure-identity==1.13.0"
# 2. Setup pre-requisites
# Abort early if the subscription cannot be selected; every later az call
# would otherwise fail against the wrong (or no) subscription.
az account set -s "$subscription_id" || {
    echo "Failed to set subscription $subscription_id"
    exit 1
}
# NOTE: $workspace_info is expanded UNQUOTED at every use site on purpose, so
# that it word-splits into separate --resource-group/--workspace-name arguments.
workspace_info="--resource-group $resource_group_name --workspace-name $workspace_name"
# Create an AmlCompute cluster if it does not already exist.
# Arguments: $1 - cluster name, $2 - VM size (sku) used only when creating
# Globals:   workspace_info (read; intentionally unquoted so it word-splits
#            into separate CLI arguments)
# Exits the script with status 1 if creation fails.
ensure_compute_cluster() {
    local cluster_name=$1
    local cluster_sku=$2
    if az ml compute show --name "$cluster_name" $workspace_info
    then
        echo "Compute cluster $cluster_name already exists"
    else
        echo "Creating compute cluster $cluster_name"
        az ml compute create --name "$cluster_name" --type amlcompute \
            --min-instances 0 --max-instances 2 --size "$cluster_sku" $workspace_info || {
            echo "Failed to create compute cluster $cluster_name"
            exit 1
        }
    fi
}
# Model-import (CPU), finetune (GPU) and evaluation clusters. The evaluation
# cluster defaults to the finetune cluster, in which case the middle call
# already created it and the third is a cheap no-op.
ensure_compute_cluster "$compute_cluster_model_import" "$compute_model_import_sku"
ensure_compute_cluster "$compute_cluster_finetune" "$compute_finetune_sku"
ensure_compute_cluster "$compute_model_evaluation" "$compute_model_evaluation_sku"
# Check if the finetuning pipeline component exists in the registry
if ! az ml component show --name "$finetuning_pipeline_component" --label latest --registry-name "$registry_name"
then
    echo "Finetuning pipeline component $finetuning_pipeline_component does not exist"
    exit 1
fi
# 3. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name "$aml_registry_model_name" --label "$model_label" --registry-name "$registry_name"
then
    echo "Model $aml_registry_model_name:$model_label does not exist in registry $registry_name"
    exit 1
fi
# Resolve the label (e.g. "latest") to a concrete version number, since the
# pipeline input below addresses the model by version, not by label.
model_version=$(az ml model show --name "$aml_registry_model_name" --label "$model_label" --registry-name "$registry_name" --query version --output tsv) || {
    echo "Failed to query version of model $aml_registry_model_name"
    exit 1
}
# 4. Prepare data
# Downloads the fridge-objects dataset and writes the MLTable folders used
# below; abort if it fails instead of discovering the problem at job submit.
python prepare_data.py --subscription "$subscription_id" --group "$resource_group_name" --workspace "$workspace_name" || {
    echo "Failed to prepare data"
    exit 1
}
# training data
train_data="./data/training-mltable-folder"
# validation data
validation_data="./data/validation-mltable-folder"
# test data
# Using the same data for validation and test. If you want to use a different dataset for test, specify it below
test_data="./data/validation-mltable-folder"
# Check if training data, validation data exist (quoted: paths are user-editable)
if [ ! -d "$train_data" ]; then
    echo "Training data $train_data does not exist"
    exit 1
fi
if [ ! -d "$validation_data" ]; then
    echo "Validation data $validation_data does not exist"
    exit 1
fi
if [ ! -d "$test_data" ]; then
    echo "Test data $test_data does not exist"
    exit 1
fi
# 5. Submit finetuning job using pipeline.yaml for a HuggingFace Transformers model
# # If you want to use a HuggingFace model, specify the inputs.model_name instead of inputs.mlflow_model_path.path like below
# inputs.model_name=$huggingface_model_name
# The --set flags override the pipeline YAML in place: pin the finetuning
# component and the foundation model (by resolved version), and wire in the
# local MLTable folders and the three compute clusters created above.
# The job name is captured (--query name -o tsv) for streaming/registration below.
huggingface_parent_job_name=$( az ml job create \
--file "./hftransformers-fridgeobjects-multiclass-classification-pipeline.yaml" \
$workspace_info \
--query name -o tsv \
--set jobs.huggingface_transformers_model_finetune_job.component="azureml://registries/$registry_name/components/$finetuning_pipeline_component/labels/latest" \
inputs.mlflow_model_path.path="azureml://registries/$registry_name/models/$aml_registry_model_name/versions/$model_version" \
inputs.training_data.path=$train_data \
inputs.validation_data.path=$validation_data \
inputs.test_data.path=$test_data \
inputs.compute_model_import=$compute_cluster_model_import \
inputs.compute_finetune=$compute_cluster_finetune \
inputs.compute_model_evaluation=$compute_model_evaluation
) || {
echo "Failed to submit finetuning job"
exit 1
}
# Blocks until the pipeline job finishes; a non-zero exit also covers job failure.
az ml job stream --name $huggingface_parent_job_name $workspace_info || {
echo "job stream failed"; exit 1;
}
# 6. Create model in workspace from train job output for fine-tuned HuggingFace Transformers model
# Registers the pipeline's "trained_model" output as an MLflow model, versioned
# with the same timestamp used for the endpoint name.
az ml model create --name "$finetuned_huggingface_model_name" --version "$version" --type mlflow_model \
    --path "azureml://jobs/$huggingface_parent_job_name/outputs/trained_model" $workspace_info || {
    echo "model create in workspace failed"; exit 1;
}
# 7. Deploy the fine-tuned HuggingFace Transformers model to an endpoint
# Create online endpoint
az ml online-endpoint create --name "$huggingface_endpoint_name" $workspace_info || {
    echo "endpoint create failed"; exit 1;
}
# Deploy model from registry to endpoint in workspace; --set overrides the
# endpoint name, model reference and instance type in deploy.yaml.
az ml online-deployment create --file ./deploy.yaml $workspace_info --set \
    endpoint_name="$huggingface_endpoint_name" model="azureml:$finetuned_huggingface_model_name:$version" \
    instance_type="$deployment_sku" || {
    echo "deployment create failed"; exit 1;
}
# get deployment name and set all traffic to the new deployment
yaml_file="deploy.yaml"
# Print the value of a top-level scalar key in $yaml_file, stripped of quotes
# and commas.
# Arguments: $1 - YAML key name
# Globals:   yaml_file (read)
# The key is anchored to the start of the line and only the first match is
# taken: an unanchored grep for "name:" would also match "endpoint_name:" in
# deploy.yaml and produce a multi-line result.
get_yaml_value() {
    grep -m 1 "^$1:" "$yaml_file" | awk '{print $2}' | sed 's/[",]//g'
}
deployment_name=$(get_yaml_value "name")
# Route 100% of the endpoint's traffic to the newly created deployment so the
# invoke below hits it.
az ml online-endpoint update $workspace_info --name=$huggingface_endpoint_name --traffic="$deployment_name=100" || {
echo "Failed to set all traffic to the new deployment"
exit 1
}
# 8. Try a sample scoring request on the deployed HuggingFace Transformers model
# Check if scoring data file exists
if [ -f "$huggingface_sample_request_data" ]; then
    # printf interprets the trailing newlines; plain echo (without -e) would
    # print the literal characters "\n\n".
    printf 'Invoking endpoint %s with %s\n\n' "$huggingface_endpoint_name" "$huggingface_sample_request_data"
else
    echo "Scoring file $huggingface_sample_request_data does not exist"
    exit 1
fi
az ml online-endpoint invoke --name "$huggingface_endpoint_name" --request-file "$huggingface_sample_request_data" $workspace_info || {
    echo "endpoint invoke failed"; exit 1;
}
# 9. Delete the endpoint
az ml online-endpoint delete --name "$huggingface_endpoint_name" $workspace_info --yes || {
    echo "endpoint delete failed"; exit 1;
}
# 10. Delete the request data file (-f: don't fail the script if it is gone)
rm -f -- "$huggingface_sample_request_data"