Commit 4ca4fbb

Deploy a model trained on Amazon SageMaker in a multicloud environment with ONNX (#4756)
* Added code for deploying a SageMaker-trained model in another cloud with ONNX Runtime
* Formatted with black
* Added/updated CI badges
* Added README.md
* Updated README.md to add a link to the blog post
* Removed tmp file ~.xlsx accidentally added
1 parent 70bd378 commit 4ca4fbb

File tree: 8 files changed, +1947 −0 lines changed

Lines changed: 7 additions & 0 deletions
# Train and deploy ML models in a multicloud environment using Amazon SageMaker

As customers accelerate their migrations to the cloud and transform their business, some find themselves in situations where they have to manage IT operations in a multicloud environment. For example, you might have acquired a company that was already running on a different cloud provider, or you may have a workload that generates value from unique capabilities provided by AWS. Another example is independent software vendors (ISVs) that make their products and services available on different cloud platforms to benefit their end customers. Or an organization may operate in a Region where its primary cloud provider is not available and, to meet data sovereignty or data residency requirements, use a secondary cloud provider.

In this notebook, we demonstrate one of the many options you have to take advantage of AWS's broadest and deepest set of AI/ML capabilities in a multicloud environment. We show how you can build and train an ML model on AWS and deploy it on another platform. We train the model using Amazon SageMaker, store the model artifacts in Amazon Simple Storage Service (Amazon S3), and deploy and run the model in Azure. This approach is beneficial if you use AWS ML services for their comprehensive feature set, yet need to run your model in another cloud.

For more details on the approach, read the blog post [Train and deploy ML models in a multicloud environment using Amazon SageMaker](https://aws.amazon.com/blogs/machine-learning/train-and-deploy-ml-models-in-a-multicloud-environment-using-amazon-sagemaker/).
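As a point of reference for the training half of this example, the sketch below shows how a script like the one in this commit could be launched as a SageMaker training job. It is not part of this commit: the entry-point name train.py, the IAM role, the instance type, the framework/Python versions, and the S3 URI are all placeholder assumptions.

# Minimal sketch (not in this commit): launch the training script as a SageMaker
# training job. Entry-point name, role, instance type, versions, and S3 URI are
# placeholder assumptions.
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point="train.py",  # hypothetical filename for the training script below
    role="arn:aws:iam::111122223333:role/SageMakerExecutionRole",  # placeholder
    instance_count=1,
    instance_type="ml.c5.xlarge",
    framework_version="2.0.0",
    py_version="py310",
    hyperparameters={"epochs": 2, "batch-size": 64},
)

# The "training" channel becomes the SM_CHANNEL_TRAINING directory read by the
# script's --data-dir default; whatever the script writes to SM_MODEL_DIR
# (model.pth and mnist-pytorch.onnx) is uploaded to S3 as model.tar.gz.
estimator.fit({"training": "s3://amzn-s3-demo-bucket/mnist"})  # placeholder URI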
Lines changed: 178 additions & 0 deletions
from __future__ import print_function

import argparse
import json
import logging
import os
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR


logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

torch.manual_seed(0)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.inference_mode():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')

    # Container environment
    parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
    parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--data-dir", type=str, default=os.environ["SM_CHANNEL_TRAINING"])
    parser.add_argument("--num-gpus", type=int, default=os.environ["SM_NUM_GPUS"])

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST(args.data_dir, train=True, transform=transform)
    dataset2 = datasets.MNIST(args.data_dir, train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    save_model(model, args.model_dir)
    export_to_onnx(model, args.model_dir, device)


def save_model(model, model_dir):
    logger.info("Saving the model.")
    path = os.path.join(model_dir, "model.pth")
    # saves the full model object; see http://pytorch.org/docs/master/notes/serialization.html
    torch.save(model, path)


def export_to_onnx(model, model_dir, device):
    logger.info("Exporting the model to onnx.")
    dummy_input = torch.randn(1, 1, 28, 28).to(device)
    input_names = ["input_0"]
    output_names = ["output_0"]
    path = os.path.join(model_dir, 'mnist-pytorch.onnx')
    torch.onnx.export(model, dummy_input, path, verbose=True,
                      input_names=input_names, output_names=output_names,
                      dynamic_axes={'input_0': {0: 'batch_size'},  # variable length axes
                                    'output_0': {0: 'batch_size'}})


if __name__ == '__main__':
    main()
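Before deploying to another cloud, it can be worth a quick local check that the exported ONNX model reproduces the PyTorch model's outputs. The sketch below is not part of this commit; the file paths are placeholder assumptions, and it assumes the Net class above is importable so the pickled full model can be loaded.

# Sketch (not in this commit): compare ONNX Runtime output against the PyTorch
# model on a random batch. Paths are placeholders; loading the full pickled
# model requires the Net class definition to be importable.
import numpy as np
import onnxruntime as ort
import torch

model = torch.load("model.pth", map_location="cpu", weights_only=False)
model.eval()

x = torch.randn(4, 1, 28, 28)  # a batch of 4 exercises the dynamic batch axis
with torch.inference_mode():
    torch_out = model(x).numpy()

session = ort.InferenceSession("mnist-pytorch.onnx")
onnx_out = session.run(None, {"input_0": x.numpy()})[0]

# both are log-softmax outputs; they should agree to numerical tolerance
np.testing.assert_allclose(torch_out, onnx_out, rtol=1e-3, atol=1e-5)
print("ONNX output matches PyTorch output.")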
Lines changed: 51 additions & 0 deletions
import json
import logging
import os

import azure.functions as func
import numpy as np
import onnxruntime as ort


app = func.FunctionApp()


def preprocess(input_data_json):
    # convert the JSON data into the tensor input
    return np.array(input_data_json['data']).astype('float32')


def run_model(model_path, req_body):
    session = ort.InferenceSession(model_path)
    input_data = preprocess(req_body)
    logging.info(f"Input Data shape is {input_data.shape}.")
    input_name = session.get_inputs()[0].name  # get the id of the first input of the model
    try:
        result = session.run([], {input_name: input_data})
    except RuntimeError as e:
        logging.error("Shape={0} and error={1}".format(input_data.shape, e))
        raise
    return result[0]


def get_model_path():
    d = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(d, 'model', 'mnist-pytorch.onnx')


@app.function_name(name="mnist_classify")
@app.route(route="classify", auth_level=func.AuthLevel.ANONYMOUS)
def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')
    # Get the image data from the POST body.
    req_body = None
    try:
        req_body = req.get_json()
    except ValueError:
        pass

    if req_body:
        # run the model
        result = run_model(get_model_path(), req_body)
        # map the output to digit labels and return the result string
        digits = np.argmax(result, axis=1)
        logging.info(type(digits))
        return func.HttpResponse(json.dumps({"digits": np.array(digits).tolist()}))
    else:
        return func.HttpResponse(
            "This HTTP triggered function executed successfully. Pass a JSON body with a 'data' field to classify digits.",
            status_code=200
        )
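To show how the endpoint is meant to be called, here is a hedged client-side sketch; it is not part of this commit. The function URL is a placeholder, and the request shape follows from preprocess() above and the model's (N, 1, 28, 28) input with its dynamic batch axis.

# Sketch (not in this commit): call the deployed function over HTTP.
# The URL is a placeholder; "data" must have shape (N, 1, 28, 28).
import numpy as np
import requests

url = "https://<your-function-app>.azurewebsites.net/api/classify"  # placeholder

# dummy batch of two "images", normalized roughly like the training transform
batch = np.random.rand(2, 1, 28, 28).astype("float32")
batch = (batch - 0.1307) / 0.3081

resp = requests.post(url, json={"data": batch.tolist()})
print(resp.json())  # e.g. {"digits": [3, 7]}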
Lines changed: 15 additions & 0 deletions
{
  "version": "2.0",
  "logging": {
    "applicationInsights": {
      "samplingSettings": {
        "isEnabled": true,
        "excludedTypes": "Request"
      }
    }
  },
  "extensionBundle": {
    "id": "Microsoft.Azure.Functions.ExtensionBundle",
    "version": "[4.*, 5.0.0)"
  }
}
# DO NOT include azure-functions-worker in this file
# The Python Worker is managed by the Azure Functions platform
# Manually managing azure-functions-worker may cause unexpected issues

azure-functions
onnxruntime
numpy==1.26
Lines changed: 4 additions & 0 deletions
{
  "AzureWebJobsFeatureFlags": "EnableWorkerIndexing",
  "SCM_DO_BUILD_DURING_DEPLOYMENT": true
}
