Commit 4ca4fbb

Deploy a model trained on Amazon SageMaker in a multicloud environment with ONNX (#4756)
* Added code for deploying a SageMaker-trained model in another cloud with ONNX Runtime
* Formatted with black
* Added/updated CI badges
* Added README.md
* Updated README.md to add a link to the blog post
* Removed tmp file ~.xlsx accidentally added
1 parent 70bd378 commit 4ca4fbb

File tree: 8 files changed, +1947 −0 lines changed

Lines changed: 7 additions & 0 deletions
# Train and deploy ML models in a multicloud environment using Amazon SageMaker

As customers accelerate their migrations to the cloud and transform their business, some find themselves in situations where they have to manage IT operations in a multicloud environment. For example, you might have acquired a company that was already running on a different cloud provider, or you may have a workload that generates value from unique capabilities provided by AWS. Another example is independent software vendors (ISVs) that make their products and services available on different cloud platforms to benefit their end customers. Or an organization may operate in a Region where its primary cloud provider is not available and, to meet data sovereignty or data residency requirements, use a secondary cloud provider.

In this notebook, we demonstrate one of the many options you have to take advantage of AWS's broadest and deepest set of AI/ML capabilities in a multicloud environment. We show how you can build and train an ML model on AWS and deploy it on another platform. We train the model using Amazon SageMaker, store the model artifacts in Amazon Simple Storage Service (Amazon S3), and deploy and run the model in Azure. This approach is beneficial if you use AWS ML services for their comprehensive feature set, yet need to run your model in another cloud.

For more details on the approach, read the blog post [Train and deploy ML models in a multicloud environment using Amazon SageMaker](https://aws.amazon.com/blogs/machine-learning/train-and-deploy-ml-models-in-a-multicloud-environment-using-amazon-sagemaker/).
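As a point of reference for the training half of this example, the sketch below shows how a script like the one in this commit could be launched as a SageMaker training job. It is not part of this commit: the entry-point name train.py, the IAM role, the instance type, the framework/Python versions, and the S3 URI are all placeholder assumptions.

# Minimal sketch (not in this commit): launch the training script as a SageMaker
# training job. Entry-point name, role, instance type, versions, and S3 URI are
# placeholder assumptions.
from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point="train.py",  # hypothetical filename for the training script below
    role="arn:aws:iam::111122223333:role/SageMakerExecutionRole",  # placeholder
    instance_count=1,
    instance_type="ml.c5.xlarge",
    framework_version="2.0.0",
    py_version="py310",
    hyperparameters={"epochs": 2, "batch-size": 64},
)

# The "training" channel becomes the SM_CHANNEL_TRAINING directory read by the
# script's --data-dir default; whatever the script writes to SM_MODEL_DIR
# (model.pth and mnist-pytorch.onnx) is uploaded to S3 as model.tar.gz.
estimator.fit({"training": "s3://amzn-s3-demo-bucket/mnist"})  # placeholder URI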
Lines changed: 178 additions & 0 deletions
from __future__ import print_function

import argparse
import json
import logging
import os
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR


logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

torch.manual_seed(0)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            if args.dry_run:
                break


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.inference_mode():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))


def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=14, metavar='N',
                        help='number of epochs to train (default: 14)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--no-mps', action='store_true', default=False,
                        help='disables macOS GPU training')
    parser.add_argument('--dry-run', action='store_true', default=False,
                        help='quickly check a single pass')

    # Container environment
    parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
    parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--data-dir", type=str, default=os.environ["SM_CHANNEL_TRAINING"])
    parser.add_argument("--num-gpus", type=int, default=os.environ["SM_NUM_GPUS"])

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    use_mps = not args.no_mps and torch.backends.mps.is_available()

    torch.manual_seed(args.seed)

    if use_cuda:
        device = torch.device("cuda")
    elif use_mps:
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.test_batch_size}
    if use_cuda:
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST(args.data_dir, train=True, transform=transform)
    dataset2 = datasets.MNIST(args.data_dir, train=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)

    model = Net().to(device)
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)

    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    for epoch in range(1, args.epochs + 1):
        train(args, model, device, train_loader, optimizer, epoch)
        test(model, device, test_loader)
        scheduler.step()

    save_model(model, args.model_dir)
    export_to_onnx(model, args.model_dir, device)


def save_model(model, model_dir):
    logger.info("Saving the model.")
    path = os.path.join(model_dir, "model.pth")
    # saves the full model object; see http://pytorch.org/docs/master/notes/serialization.html
    torch.save(model, path)


def export_to_onnx(model, model_dir, device):
    logger.info("Exporting the model to onnx.")
    dummy_input = torch.randn(1, 1, 28, 28).to(device)
    input_names = ["input_0"]
    output_names = ["output_0"]
    path = os.path.join(model_dir, 'mnist-pytorch.onnx')
    torch.onnx.export(model, dummy_input, path, verbose=True,
                      input_names=input_names, output_names=output_names,
                      dynamic_axes={'input_0': {0: 'batch_size'},  # variable length axes
                                    'output_0': {0: 'batch_size'}})


if __name__ == '__main__':
    main()
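Before deploying to another cloud, it can be worth a quick local check that the exported ONNX model reproduces the PyTorch model's outputs. The sketch below is not part of this commit; the file paths are placeholder assumptions, and it assumes the Net class above is importable so the pickled full model can be loaded.

# Sketch (not in this commit): compare ONNX Runtime output against the PyTorch
# model on a random batch. Paths are placeholders; loading the full pickled
# model requires the Net class definition to be importable.
import numpy as np
import onnxruntime as ort
import torch

model = torch.load("model.pth", map_location="cpu", weights_only=False)
model.eval()

x = torch.randn(4, 1, 28, 28)  # a batch of 4 exercises the dynamic batch axis
with torch.inference_mode():
    torch_out = model(x).numpy()

session = ort.InferenceSession("mnist-pytorch.onnx")
onnx_out = session.run(None, {"input_0": x.numpy()})[0]

# both are log-softmax outputs; they should agree to numerical tolerance
np.testing.assert_allclose(torch_out, onnx_out, rtol=1e-3, atol=1e-5)
print("ONNX output matches PyTorch output.")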
Lines changed: 51 additions & 0 deletions
import json
import logging
import os

import azure.functions as func
import numpy as np
import onnxruntime as ort


app = func.FunctionApp()


def preprocess(input_data_json):
    # convert the JSON data into the tensor input
    return np.array(input_data_json['data']).astype('float32')


def run_model(model_path, req_body):
    session = ort.InferenceSession(model_path)
    input_data = preprocess(req_body)
    logging.info(f"Input Data shape is {input_data.shape}.")
    input_name = session.get_inputs()[0].name  # get the id of the first input of the model
    try:
        result = session.run([], {input_name: input_data})
    except RuntimeError as e:
        logging.error("Shape={0} and error={1}".format(input_data.shape, e))
        raise
    return result[0]


def get_model_path():
    d = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(d, 'model', 'mnist-pytorch.onnx')


@app.function_name(name="mnist_classify")
@app.route(route="classify", auth_level=func.AuthLevel.ANONYMOUS)
def main(req: func.HttpRequest) -> func.HttpResponse:
    logging.info('Python HTTP trigger function processed a request.')
    # Get the image data from the POST body.
    req_body = None
    try:
        req_body = req.get_json()
    except ValueError:
        pass

    if req_body:
        # run the model
        result = run_model(get_model_path(), req_body)
        # map the output to digit labels and return the result string
        digits = np.argmax(result, axis=1)
        logging.info(type(digits))
        return func.HttpResponse(json.dumps({"digits": np.array(digits).tolist()}))
    else:
        return func.HttpResponse(
            "This HTTP triggered function executed successfully. Pass a JSON body with a 'data' field to classify digits.",
            status_code=200
        )
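To show how the endpoint is meant to be called, here is a hedged client-side sketch; it is not part of this commit. The function URL is a placeholder, and the request shape follows from preprocess() above and the model's (N, 1, 28, 28) input with its dynamic batch axis.

# Sketch (not in this commit): call the deployed function over HTTP.
# The URL is a placeholder; "data" must have shape (N, 1, 28, 28).
import numpy as np
import requests

url = "https://<your-function-app>.azurewebsites.net/api/classify"  # placeholder

# dummy batch of two "images", normalized roughly like the training transform
batch = np.random.rand(2, 1, 28, 28).astype("float32")
batch = (batch - 0.1307) / 0.3081

resp = requests.post(url, json={"data": batch.tolist()})
print(resp.json())  # e.g. {"digits": [3, 7]}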
Lines changed: 15 additions & 0 deletions
{
  "version": "2.0",
  "logging": {
    "applicationInsights": {
      "samplingSettings": {
        "isEnabled": true,
        "excludedTypes": "Request"
      }
    }
  },
  "extensionBundle": {
    "id": "Microsoft.Azure.Functions.ExtensionBundle",
    "version": "[4.*, 5.0.0)"
  }
}
# DO NOT include azure-functions-worker in this file
# The Python Worker is managed by the Azure Functions platform
# Manually managing azure-functions-worker may cause unexpected issues

azure-functions
onnxruntime
numpy==1.26
Lines changed: 4 additions & 0 deletions
{
  "AzureWebJobsFeatureFlags": "EnableWorkerIndexing",
  "SCM_DO_BUILD_DURING_DEPLOYMENT": true
}
