aws · kandakji · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024 · Jul 31, 2024
diff --git a/README.md b/README.md
@@ -196,6 +196,7 @@ More examples for models such as BERT and YOLOv5 can be found in [distributed_tr
 - [Train GPT-2 with Sharded Data Parallel](https://github.com/aws/amazon-sagemaker-examples/tree/main/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple-sharded-data-parallel.ipynb) shows how to train GPT-2 with near-linear scaling using Sharded Data Parallelism technique in SageMaker Model Parallelism Library.
 - [Train EleutherAI GPT-J with Model Parallel](https://github.com/aws/amazon-sagemaker-examples/blob/main/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb) shows how to train EleutherAI GPT-J with PyTorch and Tensor Parallelism technique in the SageMaker Model Parallelism Library.
 - [Train MaskRCNN with Data Parallel](https://github.com/aws/amazon-sagemaker-examples/blob/main/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb) shows how to train MaskRCNN with PyTorch and SageMaker Data Parallelism Library.
+- [Distributed training with TensorFlow Multi-Worker Mirrored Strategy API on Amazon SageMaker](https://github.com/aws/amazon-sagemaker-examples/blob/main/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb) shows how to train an MNIST Classifier with TensorFlow using TensorFlow's Multi-Worker Mirrored Strategy for Distributed Training.
 
 ### Amazon SageMaker Smart Sifting
 

diff --git a/training/distributed_training/index.rst b/training/distributed_training/index.rst
@@ -159,6 +159,15 @@ Horovod
    /sagemaker-python-sdk/keras_script_mode_pipe_mode_horovod/tensorflow_keras_CIFAR10
 
 
+TensorFlow Multi-Worker Mirrored Strategy (MWMS)
+------------------------------------------------
+
+.. toctree::
+   :maxdepth: 1
+
+   tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy
+
+
 .. _mxnet-distributed:
 
 Apache MXNet

diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py
@@ -0,0 +1,104 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import argparse
+import json
+import os
+
+import numpy as np
+import tensorflow as tf
+
+
+def model(x_train, y_train, x_test, y_test, strategy):
+    """Build, train and evaluate a small dense classifier.
+
+    The network (flatten -> 1024-unit ReLU layer -> dropout -> 10-way
+    softmax) is created and compiled inside ``strategy.scope()``; ``fit``
+    and ``evaluate`` are then called outside of that scope.
+
+    Returns the trained Keras model.
+    """
+    with strategy.scope():
+        layers = [
+            tf.keras.layers.Flatten(),
+            tf.keras.layers.Dense(1024, activation=tf.nn.relu),
+            tf.keras.layers.Dropout(0.4),
+            tf.keras.layers.Dense(10, activation=tf.nn.softmax),
+        ]
+        classifier = tf.keras.models.Sequential(layers)
+        classifier.compile(
+            optimizer="adam",
+            loss="sparse_categorical_crossentropy",
+            metrics=["accuracy"],
+        )
+
+    # Train first, then report metrics on the evaluation arrays.
+    classifier.fit(x_train, y_train)
+    classifier.evaluate(x_test, y_test)
+    return classifier
+
+
+def _load_training_data(base_dir):
+    """Read the MNIST training arrays stored as ``.npy`` files in ``base_dir``.
+
+    Returns a ``(data, labels)`` tuple loaded from ``train_data.npy`` and
+    ``train_labels.npy`` respectively.
+    """
+    data_path = os.path.join(base_dir, "train_data.npy")
+    labels_path = os.path.join(base_dir, "train_labels.npy")
+    return np.load(data_path), np.load(labels_path)
+
+
+def _load_testing_data(base_dir):
+    """Read the MNIST evaluation arrays stored as ``.npy`` files in ``base_dir``.
+
+    Returns a ``(data, labels)`` tuple loaded from ``eval_data.npy`` and
+    ``eval_labels.npy`` respectively.
+    """
+    data_path = os.path.join(base_dir, "eval_data.npy")
+    labels_path = os.path.join(base_dir, "eval_labels.npy")
+    return np.load(data_path), np.load(labels_path)
+
+
+def _parse_args():
+    """Parse the command-line arguments of the training script.
+
+    Defaults are taken from the ``SM_MODEL_DIR``, ``SM_CHANNEL_TRAINING``,
+    ``SM_HOSTS`` and ``SM_CURRENT_HOST`` environment variables.
+
+    Returns:
+        The ``(namespace, unknown_args)`` tuple produced by
+        ``parse_known_args``, so unrecognized arguments are handed back to
+        the caller instead of aborting the script.
+    """
+    parser = argparse.ArgumentParser()
+
+    # Data, model, and output directories
+    # model_dir is always passed in from SageMaker. By default this is an S3 path under the default bucket.
+    parser.add_argument("--model_dir", type=str)
+    parser.add_argument("--sm-model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAINING"))
+    # SM_HOSTS is decoded as JSON, so an explicit --hosts value is decoded the
+    # same way (type=list would split the string into single characters).
+    # Fall back to "[]" so a missing SM_HOSTS does not make json.loads fail.
+    parser.add_argument(
+        "--hosts",
+        type=json.loads,
+        default=json.loads(os.environ.get("SM_HOSTS", "[]")),
+    )
+    parser.add_argument("--current-host", type=str, default=os.environ.get("SM_CURRENT_HOST"))
+
+    return parser.parse_known_args()
+
+
+# Script entry point: parse arguments, load the arrays, create a
+# MultiWorkerMirroredStrategy, train the classifier and save it.
+if __name__ == "__main__":
+    args, unknown = _parse_args()
+
+    # Training and evaluation arrays are both read from the args.train directory.
+    train_data, train_labels = _load_training_data(args.train)
+    eval_data, eval_labels = _load_testing_data(args.train)
+
+    print("Tensorflow version: ", tf.__version__)
+    print("TF_CONFIG", os.environ.get("TF_CONFIG"))
+
+    # Request NCCL as the collective communication implementation.
+    communication_options = tf.distribute.experimental.CommunicationOptions(
+        implementation=tf.distribute.experimental.CommunicationImplementation.NCCL
+    )
+    strategy = tf.distribute.MultiWorkerMirroredStrategy(
+        communication_options=communication_options
+    )
+
+    print("Number of devices: {}".format(strategy.num_replicas_in_sync))
+
+    mnist_classifier = model(train_data, train_labels, eval_data, eval_labels, strategy)
+
+    task_type, task_id = (strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id)
+
+    print("Task type: ", task_type)
+    print("Task id: ", task_id)
+
+    # Save the model on chief worker
+    # Task 0 writes to the "000000001" sub-directory of sm_model_dir; every
+    # other task writes to a /tmp path named after its task id.
+    # NOTE(review): presumably all workers call save() because saving under
+    # MultiWorkerMirroredStrategy is a collective step - confirm against the
+    # TensorFlow multi-worker training guide.
+    if strategy.cluster_resolver.task_id == 0:
+        print("Saving model on chief")
+        mnist_classifier.save(os.path.join(args.sm_model_dir, "000000001"))
+    else:
+        print("Saving model in /tmp on worker")
+        mnist_classifier.save(f"/tmp/{strategy.cluster_resolver.task_id}")
diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist.py b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist.py
@@ -0,0 +1,79 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import argparse
+import json
+import os
+
+import numpy as np
+import tensorflow as tf
+
+
+def model(x_train, y_train, x_test, y_test):
+    """Build, train and evaluate a small dense classifier.
+
+    The network is flatten -> 1024-unit ReLU layer -> dropout -> 10-way
+    softmax, compiled with the Adam optimizer and sparse categorical
+    cross-entropy loss.
+
+    Returns the trained Keras model.
+    """
+    layers = [
+        tf.keras.layers.Flatten(),
+        tf.keras.layers.Dense(1024, activation=tf.nn.relu),
+        tf.keras.layers.Dropout(0.4),
+        tf.keras.layers.Dense(10, activation=tf.nn.softmax),
+    ]
+    classifier = tf.keras.models.Sequential(layers)
+    classifier.compile(
+        optimizer="adam",
+        loss="sparse_categorical_crossentropy",
+        metrics=["accuracy"],
+    )
+
+    # Train first, then report metrics on the evaluation arrays.
+    classifier.fit(x_train, y_train)
+    classifier.evaluate(x_test, y_test)
+    return classifier
+
+
+def _load_training_data(base_dir):
+    """Read the MNIST training arrays stored as ``.npy`` files in ``base_dir``.
+
+    Returns a ``(data, labels)`` tuple loaded from ``train_data.npy`` and
+    ``train_labels.npy`` respectively.
+    """
+    data_path = os.path.join(base_dir, "train_data.npy")
+    labels_path = os.path.join(base_dir, "train_labels.npy")
+    return np.load(data_path), np.load(labels_path)
+
+
+def _load_testing_data(base_dir):
+    """Read the MNIST evaluation arrays stored as ``.npy`` files in ``base_dir``.
+
+    Returns a ``(data, labels)`` tuple loaded from ``eval_data.npy`` and
+    ``eval_labels.npy`` respectively.
+    """
+    data_path = os.path.join(base_dir, "eval_data.npy")
+    labels_path = os.path.join(base_dir, "eval_labels.npy")
+    return np.load(data_path), np.load(labels_path)
+
+
+def _parse_args():
+    """Parse the command-line arguments of the training script.
+
+    Defaults are taken from the ``SM_MODEL_DIR``, ``SM_CHANNEL_TRAINING``,
+    ``SM_HOSTS`` and ``SM_CURRENT_HOST`` environment variables.
+
+    Returns:
+        The ``(namespace, unknown_args)`` tuple produced by
+        ``parse_known_args``, so unrecognized arguments are handed back to
+        the caller instead of aborting the script.
+    """
+    parser = argparse.ArgumentParser()
+
+    # Data, model, and output directories
+    # model_dir is always passed in from SageMaker. By default this is an S3 path under the default bucket.
+    parser.add_argument("--model_dir", type=str)
+    parser.add_argument("--sm-model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAINING"))
+    # SM_HOSTS is decoded as JSON, so an explicit --hosts value is decoded the
+    # same way (type=list would split the string into single characters).
+    # Fall back to "[]" so a missing SM_HOSTS does not make json.loads fail.
+    parser.add_argument(
+        "--hosts",
+        type=json.loads,
+        default=json.loads(os.environ.get("SM_HOSTS", "[]")),
+    )
+    parser.add_argument("--current-host", type=str, default=os.environ.get("SM_CURRENT_HOST"))
+
+    return parser.parse_known_args()
+
+
+if __name__ == "__main__":
+    cli_args, _extra = _parse_args()
+
+    # Training and evaluation arrays are both read from the same directory.
+    x_train, y_train = _load_training_data(cli_args.train)
+    x_eval, y_eval = _load_testing_data(cli_args.train)
+
+    trained = model(x_train, y_train, x_eval, y_eval)
+
+    # Only the first entry of the hosts list writes the model.
+    is_first_host = cli_args.current_host == cli_args.hosts[0]
+    if is_first_host:
+        # Export in TensorFlow SavedModel format under the "000000001"
+        # sub-directory of sm_model_dir.
+        # To export the model as h5 format use model.save('my_model.h5')
+        export_path = os.path.join(cli_args.sm_model_dir, "000000001")
+        trained.save(export_path)