From a821fa6b592f24bacda81718011aa1297784de79 Mon Sep 17 00:00:00 2001
From: kandakji <44462355+kandakji@users.noreply.github.com>
Date: Wed, 31 Jul 2024 10:24:31 +0200
Subject: [PATCH 01/14] Create mnist.py script

---
 .../multi_worker_mirrored_strategy/mnist.py | 79 +++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist.py

diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist.py b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist.py
new file mode 100644
index 0000000000..c86f1b14db
--- /dev/null
+++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist.py
@@ -0,0 +1,79 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+
+import argparse
+import json
+import os
+
+import numpy as np
+import tensorflow as tf
+
+
+def model(x_train, y_train, x_test, y_test):
+    """Generate a simple model"""
+    model = tf.keras.models.Sequential(
+        [
+            tf.keras.layers.Flatten(),
+            tf.keras.layers.Dense(1024, activation=tf.nn.relu),
+            tf.keras.layers.Dropout(0.4),
+            tf.keras.layers.Dense(10, activation=tf.nn.softmax),
+        ]
+    )
+
+    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
+    model.fit(x_train, y_train)
+    model.evaluate(x_test, y_test)
+
+    return model
+
+
+def _load_training_data(base_dir):
+    """Load MNIST training data"""
+    x_train = np.load(os.path.join(base_dir, "train_data.npy"))
+    y_train = np.load(os.path.join(base_dir, "train_labels.npy"))
+    return x_train, y_train
+
+
+def _load_testing_data(base_dir):
+    """Load MNIST testing data"""
+    x_test = np.load(os.path.join(base_dir, "eval_data.npy"))
+    y_test = np.load(os.path.join(base_dir, "eval_labels.npy"))
+    return x_test, y_test
+
+
+def _parse_args():
+    parser = argparse.ArgumentParser()
+
+    # Data, model, and output directories
+    # model_dir is always passed in from SageMaker. By default this is an S3 path under the default bucket.
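+    # --sm-model-dir (SM_MODEL_DIR) is the local directory, /opt/ml/model, whose
+    # contents SageMaker uploads to S3 as the trained model when the job completes.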
+ parser.add_argument("--model_dir", type=str) + parser.add_argument("--sm-model-dir", type=str, default=os.environ.get("SM_MODEL_DIR")) + parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAINING")) + parser.add_argument("--hosts", type=list, default=json.loads(os.environ.get("SM_HOSTS"))) + parser.add_argument("--current-host", type=str, default=os.environ.get("SM_CURRENT_HOST")) + + return parser.parse_known_args() + + +if __name__ == "__main__": + args, unknown = _parse_args() + + train_data, train_labels = _load_training_data(args.train) + eval_data, eval_labels = _load_testing_data(args.train) + + mnist_classifier = model(train_data, train_labels, eval_data, eval_labels) + + if args.current_host == args.hosts[0]: + # save model to an S3 directory with version number '00000001' in Tensorflow SavedModel Format + # To export the model as h5 format use model.save('my_model.h5') + mnist_classifier.save(os.path.join(args.sm_model_dir, "000000001")) From aaa64d3b62e0504ae49992a7d1ee8dc173b1a4d8 Mon Sep 17 00:00:00 2001 From: kandakji <44462355+kandakji@users.noreply.github.com> Date: Wed, 31 Jul 2024 10:40:39 +0200 Subject: [PATCH 02/14] Create mnist-distributed.py script --- .../mnist-distributed.py | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py new file mode 100644 index 0000000000..21a0621520 --- /dev/null +++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py @@ -0,0 +1,95 @@ +# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. 
See the License for the specific
+# language governing permissions and limitations under the License.
+
+import argparse
+import json
+import os
+
+import numpy as np
+import tensorflow as tf
+
+
+def model(x_train, y_train, x_test, y_test):
+    """Generate a simple model"""
+    communication_options = tf.distribute.experimental.CommunicationOptions(
+        implementation=tf.distribute.experimental.CommunicationImplementation.NCCL)
+    strategy = tf.distribute.MultiWorkerMirroredStrategy(
+        communication_options=communication_options)
+
+    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
+
+
+
+
+    with strategy.scope():
+
+        model = tf.keras.models.Sequential(
+            [
+                tf.keras.layers.Flatten(),
+                tf.keras.layers.Dense(1024, activation=tf.nn.relu),
+                tf.keras.layers.Dropout(0.4),
+                tf.keras.layers.Dense(10, activation=tf.nn.softmax),
+            ]
+        )
+
+        model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
+
+    model.fit(x_train, y_train)
+    model.evaluate(x_test, y_test)
+
+    return model
+
+
+def _load_training_data(base_dir):
+    """Load MNIST training data"""
+    x_train = np.load(os.path.join(base_dir, "train_data.npy"))
+    y_train = np.load(os.path.join(base_dir, "train_labels.npy"))
+    return x_train, y_train
+
+
+def _load_testing_data(base_dir):
+    """Load MNIST testing data"""
+    x_test = np.load(os.path.join(base_dir, "eval_data.npy"))
+    y_test = np.load(os.path.join(base_dir, "eval_labels.npy"))
+    return x_test, y_test
+
+
+def _parse_args():
+    parser = argparse.ArgumentParser()
+
+    # Data, model, and output directories
+    # model_dir is always passed in from SageMaker. By default this is an S3 path under the default bucket.
+    parser.add_argument("--model_dir", type=str)
+    parser.add_argument("--sm-model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
+    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAINING"))
+    parser.add_argument("--hosts", type=list, default=json.loads(os.environ.get("SM_HOSTS")))
+    parser.add_argument("--current-host", type=str, default=os.environ.get("SM_CURRENT_HOST"))
+
+    return parser.parse_known_args()
+
+
+if __name__ == "__main__":
+    args, unknown = _parse_args()
+
+    train_data, train_labels = _load_training_data(args.train)
+    eval_data, eval_labels = _load_testing_data(args.train)
+
+    print("Tensorflow version: ", tf.__version__)
+
+
+    mnist_classifier = model(train_data, train_labels, eval_data, eval_labels)
+
+    if args.current_host == args.hosts[0]:
+        # Save the model to an S3 directory with version number '000000001' in TensorFlow SavedModel format
+        # To export the model in h5 format use model.save('my_model.h5')
+        mnist_classifier.save(os.path.join(args.sm_model_dir, "000000001"))

From 7034c2f08a76b12bcd9d5a9347d5a37da4b67b72 Mon Sep 17 00:00:00 2001
From: kandakji <44462355+kandakji@users.noreply.github.com>
Date: Wed, 31 Jul 2024 12:09:11 +0200
Subject: [PATCH 03/14] Update mnist-distributed.py

---
 .../mnist-distributed.py | 40 +++++++++++--------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py
index 21a0621520..a6b8ea453b 100644
--- a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py
+++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py
@@ -19,18 +19,8 @@ import tensorflow as tf
-def model(x_train, y_train, x_test, y_test):
+def model(x_train, y_train, x_test, y_test, strategy):
     """Generate a simple model"""
-    communication_options = tf.distribute.experimental.CommunicationOptions(
-        implementation=tf.distribute.experimental.CommunicationImplementation.NCCL)
-    strategy = tf.distribute.MultiWorkerMirroredStrategy(
-        communication_options=communication_options)
-
-    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
-
-
-
-
     with strategy.scope():
 
         model = tf.keras.models.Sequential(
@@ -85,11 +75,27 @@ def _parse_args():
     eval_data, eval_labels = _load_testing_data(args.train)
 
     print("Tensorflow version: ", tf.__version__)
-
+    print("TF_CONFIG", os.environ.get("TF_CONFIG"))
 
-    mnist_classifier = model(train_data, train_labels, eval_data, eval_labels)
-
-    if args.current_host == args.hosts[0]:
-        # Save the model to an S3 directory with version number '000000001' in TensorFlow SavedModel format
-        # To export the model in h5 format use model.save('my_model.h5')
+    communication_options = tf.distribute.experimental.CommunicationOptions(
+        implementation=tf.distribute.experimental.CommunicationImplementation.NCCL)
+    strategy = tf.distribute.MultiWorkerMirroredStrategy(
+        communication_options=communication_options)
+
+    print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
+
+    mnist_classifier = model(train_data, train_labels, eval_data, eval_labels, strategy)
+
+    task_type, task_id = (strategy.cluster_resolver.task_type,
+                          strategy.cluster_resolver.task_id)
+
+    print("Task type: ",task_type)
+    print("Task id: ",task_id)
+
+    # Save the model on chief worker
+    if strategy.cluster_resolver.task_id == 0:
+        print("Saving model on chief")
         mnist_classifier.save(os.path.join(args.sm_model_dir, "000000001"))
+    else:
+        print("Saving model in /tmp on worker")
+        mnist_classifier.save(f"/tmp/{strategy.cluster_resolver.task_id}")

From f01cd02fb012d16572dfbb86a4574c8a59a32f1f Mon Sep 17 00:00:00 2001
From: kandakji <44462355+kandakji@users.noreply.github.com>
Date: Wed, 31 Jul 2024 12:09:41 +0200
Subject: [PATCH 04/14] Create tensorflow_multi_worker_mirrored_strategy.ipynb

---
 ...rflow_multi_worker_mirrored_strategy.ipynb | 344 ++++++++++++++++++
 1 file changed, 344 insertions(+)
 create mode 100644 training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb

diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
new file mode 100644
index 0000000000..8ffb673d98
--- /dev/null
+++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
@@ -0,0 +1,344 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Distributed training with TensorFlow Distribute Strategy API on Amazon SageMaker"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[TensorFlow's Distributed Training API](https://www.tensorflow.org/guide/distributed_training) natively supports multiple strategies for distributed training. In this example, we will use the [SageMaker Python SDK](https://github.com/aws/sagemaker-python-sdk) to run a distributed training job on the training instance using a TensorFlow training script and the SageMaker Deep Learning Container (DLC) for TensorFlow training. 
We will use the popular MNIST dataset to train a classifier based on a simple neural network architecture.\n",
+    "\n",
+    "We will start with a non-distributed neural network MNIST training script and then adapt it to use distributed training."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set up the environment\n",
+    "\n",
+    "Let's start by setting up the environment:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sagemaker\n",
+    "from sagemaker import get_execution_role\n",
+    "\n",
+    "sagemaker_session = sagemaker.Session()\n",
+    "\n",
+    "role = get_execution_role()\n",
+    "region = sagemaker_session.boto_session.region_name"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Training Data\n",
+    "\n",
+    "We will use the MNIST dataset, which has already been loaded to the public S3 buckets ``sagemaker-sample-data-<region>`` under the prefix ``tensorflow/mnist``. There are four ``.npy`` files under this prefix:\n",
+    "* ``train_data.npy``\n",
+    "* ``eval_data.npy``\n",
+    "* ``train_labels.npy``\n",
+    "* ``eval_labels.npy``"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "training_data_uri = \"s3://sagemaker-sample-data-{}/tensorflow/mnist\".format(region)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Construct the training script\n",
+    "\n",
+    "This tutorial's training script is based on a [SageMaker MNIST example](https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker-python-sdk/tensorflow_script_mode_training_and_serving/mnist-2.py). Here is the entire script:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# TensorFlow script\n",
+    "!pygmentize 'mnist.py'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create a training job using the `TensorFlow` estimator\n",
+    "\n",
+    "The `sagemaker.tensorflow.TensorFlow` estimator handles locating the training container based on the framework version and the job type (Inference or Training), uploading your script to an S3 location and creating a SageMaker training job. Let's call out a couple of important parameters here:\n",
+    "\n",
+    "* `framework_version` is set to `'2.14.1'` to indicate the TensorFlow version we want to use for executing your model training code. This tells SageMaker which DLC to use. Here's the list of the [available Deep Learning Container Images](https://github.com/aws/deep-learning-containers/blob/master/available_images.md).\n",
+    "\n",
+    "* `entry_point` is the absolute or relative path to the local Python source file that should be executed as the entry point to training. 
\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from sagemaker.tensorflow import TensorFlow\n", + "\n", + "local_mode = True\n", + "\n", + "if local_mode:\n", + " instance_type = \"local_gpu\"\n", + " instance_count=1\n", + "else:\n", + " instance_type = \"ml.g5.xlarge\"\n", + " instance_count=1\n", + "\n", + "mnist_estimator = TensorFlow(\n", + " entry_point=\"mnist.py\",\n", + " role=role,\n", + " instance_count=instance_count,\n", + " instance_type=instance_type,\n", + " framework_version=\"2.14.1\",\n", + " py_version=\"py310\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Calling ``fit``\n", + "\n", + "To start a training job, we call `estimator.fit(training_data_uri)`.\n", + "\n", + "An S3 location is used here as the input. `fit` creates a default channel named `'training'`, which points to this S3 location. In the training script we can then access the training data from the location stored in `SM_CHANNEL_TRAINING`. `fit` accepts a couple other types of input as well. See the API doc [here](https://sagemaker.readthedocs.io/en/stable/estimators.html#sagemaker.estimator.EstimatorBase.fit) for details.\n", + "\n", + "When training starts, the TensorFlow container executes mnist.py, passing `hyperparameters` and `model_dir` from the estimator as script arguments. Because we didn't define either in this example, no hyperparameters are passed, and `model_dir` defaults to `s3:///`, so the script execution is as follows:\n", + "```bash\n", + "python mnist.py --model_dir s3:///\n", + "```\n", + "When training is complete, the training job will upload the saved model to Amazon S3." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Calling fit to train a model with TensorFlow script." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "mnist_estimator.fit(training_data_uri)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adapt the training job and training script to use Distribtued training\n", + "\n", + "In this section, we use an adapter training script that leverages Tensorflow distributed training. We will use the [`MultiWorkerMirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#multiworkermirroredstrategy) which performs Distributed Data Parallelism\n", + "\n", + "MultiWorkerMirroredStrategy has two implementations for cross-device communications:\n", + "\n", + "1. RING is RPC-based and supports both CPUs and GPUs.\n", + "\n", + "2. NCCL uses [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl) which provides state-of-art performance on GPUs but it doesn't support CPUs.\n", + "\n", + "In this implementation we will defers the choice to Tensorflow, which will use NCCL in case GPU devices are used." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here are the changes we implement in the script:\n", + "1. Instantiate the Multi-Worker Mirrored Strategy and the Communication Option\n", + "\n", + "```python\n", + "communication_options = tf.distribute.experimental.CommunicationOptions(\n", + " implementation=tf.distribute.experimental.CommunicationImplementation.NCCL)\n", + "strategy = tf.distribute.MultiWorkerMirroredStrategy(\n", + " communication_options=communication_options)\n", + "```\n", + "\n", + "2. 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Calling ``fit`` to train the model with the TensorFlow script."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "mnist_estimator.fit(training_data_uri)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Adapt the training job and training script to use distributed training\n",
+    "\n",
+    "In this section, we use an adapted training script that leverages TensorFlow distributed training. We will use [`MultiWorkerMirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#multiworkermirroredstrategy), which performs distributed data parallelism.\n",
+    "\n",
+    "MultiWorkerMirroredStrategy has two implementations for cross-device communications:\n",
+    "\n",
+    "1. RING is RPC-based and supports both CPUs and GPUs.\n",
+    "\n",
+    "2. NCCL uses the [NVIDIA Collective Communications Library (NCCL)](https://developer.nvidia.com/nccl), which provides state-of-the-art performance on GPUs but doesn't support CPUs.\n",
+    "\n",
+    "In this implementation we explicitly select NCCL, since the training runs on GPU devices."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here are the changes we implement in the script:\n",
+    "1. Instantiate the Multi-Worker Mirrored Strategy with its communication options\n",
+    "\n",
+    "```python\n",
+    "communication_options = tf.distribute.experimental.CommunicationOptions(\n",
+    "    implementation=tf.distribute.experimental.CommunicationImplementation.NCCL)\n",
+    "strategy = tf.distribute.MultiWorkerMirroredStrategy(\n",
+    "    communication_options=communication_options)\n",
+    "```\n",
+    "\n",
+    "2. Print the number of devices (replicas) involved in the distributed strategy\n",
+    "\n",
+    "```python\n",
+    "print('Number of devices: {}'.format(strategy.num_replicas_in_sync))\n",
+    "```\n",
+    "\n",
+    "3. In the `model` function, move the model definition and compilation inside the strategy scope context to ensure they are distributed across the defined devices\n",
+    "\n",
+    "```python\n",
+    "with strategy.scope():\n",
+    "    model = tf.keras.models.Sequential(\n",
+    "        [\n",
+    "            tf.keras.layers.Flatten(),\n",
+    "            tf.keras.layers.Dense(1024, activation=tf.nn.relu),\n",
+    "            tf.keras.layers.Dropout(0.4),\n",
+    "            tf.keras.layers.Dense(10, activation=tf.nn.softmax),\n",
+    "        ]\n",
+    "    )\n",
+    "\n",
+    "    model.compile(optimizer=\"adam\", loss=\"sparse_categorical_crossentropy\", metrics=[\"accuracy\"])\n",
+    "```\n",
+    "\n",
+    "4. Save the model only on the chief worker\n",
+    "```python\n",
+    "if strategy.cluster_resolver.task_id == 0:\n",
+    "    print(\"Saving model on chief\")\n",
+    "    mnist_classifier.save(os.path.join(args.sm_model_dir, \"000000001\"))\n",
+    "else:\n",
+    "    print(\"Saving model in /tmp on worker\")\n",
+    "    mnist_classifier.save(f\"/tmp/{strategy.cluster_resolver.task_id}\")\n",
+    "```\n",
+    "\n",
+    "---\n",
+    "\n",
+    "Here is the entire script:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# TensorFlow script\n",
+    "!pygmentize 'mnist-distributed.py'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now, we modify the `sagemaker.tensorflow.TensorFlow` estimator by changing the `entry_point` to the new script and adding a distribution strategy.\n",
+    "\n",
+    "To enable [`MultiWorkerMirroredStrategy`](https://www.tensorflow.org/guide/distributed_training#multiworkermirroredstrategy) we use the following configuration:\n",
+    "\n",
+    "```python\n",
+    "{\n",
+    "    \"multi_worker_mirrored_strategy\": {\n",
+    "        \"enabled\": True\n",
+    "    }\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "This distribution strategy option is available for TensorFlow 2.9 and later in the SageMaker Python SDK v2.xx.yy and later."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "local_mode = False\n",
+    "\n",
+    "if local_mode:\n",
+    "    instance_type = \"local_gpu\"\n",
+    "    instance_count=1\n",
+    "else:\n",
+    "    instance_type = \"ml.g5.24xlarge\"\n",
+    "    instance_count=2\n",
+    "\n",
+    "mnist_estimator_distibuted = TensorFlow(\n",
+    "    entry_point=\"mnist-distributed.py\",\n",
+    "    role=role,\n",
+    "    instance_count=instance_count,\n",
+    "    instance_type=instance_type,\n",
+    "    framework_version=\"2.14.1\",\n",
+    "    py_version=\"py310\",\n",
+    "    distribution={\n",
+    "        \"multi_worker_mirrored_strategy\": {\n",
+    "            \"enabled\": True\n",
+    "        }\n",
+    "    }\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Calling ``fit`` to train the model with the distributed TensorFlow script.\n",
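+    "\n",
+    "During this run, SageMaker sets the `TF_CONFIG` environment variable on each host, which the script prints for debugging. On the first of two hosts it looks roughly like the following sketch (the worker addresses and port shown here are illustrative; they are assigned by SageMaker):\n",
+    "\n",
+    "```json\n",
+    "{\"cluster\": {\"worker\": [\"algo-1:2222\", \"algo-2:2222\"]}, \"task\": {\"type\": \"worker\", \"index\": 0}}\n",
+    "```"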
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "mnist_estimator_distibuted.fit(training_data_uri)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_tensorflow2_p310", + "language": "python", + "name": "conda_tensorflow2_p310" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 332401e19b6bcd01b3e88306870d3ddf9c90f2b8 Mon Sep 17 00:00:00 2001 From: kandakji <44462355+kandakji@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:19:32 +0200 Subject: [PATCH 05/14] lint mnist-distributed.py --- .../mnist-distributed.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py index a6b8ea453b..95169d7e80 100644 --- a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py +++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py @@ -32,8 +32,10 @@ def model(x_train, y_train, x_test, y_test, strategy): ] ) - model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]) - + model.compile( + optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] + ) + model.fit(x_train, y_train) model.evaluate(x_test, y_test) @@ -73,25 +75,26 @@ def _parse_args(): train_data, train_labels = _load_training_data(args.train) eval_data, eval_labels = _load_testing_data(args.train) - + print("Tensorflow version: ", tf.__version__) print("TF_CONFIG", os.environ.get("TF_CONFIG")) - + communication_options = tf.distribute.experimental.CommunicationOptions( - implementation=tf.distribute.experimental.CommunicationImplementation.NCCL) + implementation=tf.distribute.experimental.CommunicationImplementation.NCCL + ) strategy = tf.distribute.MultiWorkerMirroredStrategy( - communication_options=communication_options) - - print('Number of devices: {}'.format(strategy.num_replicas_in_sync)) - + communication_options=communication_options + ) + + print("Number of devices: {}".format(strategy.num_replicas_in_sync)) + mnist_classifier = model(train_data, train_labels, eval_data, eval_labels, strategy) - - task_type, task_id = (strategy.cluster_resolver.task_type, - strategy.cluster_resolver.task_id) - - print("Task type: ",task_type) - print("Task id: ",task_id) - + + task_type, task_id = (strategy.cluster_resolver.task_type, strategy.cluster_resolver.task_id) + + print("Task type: ", task_type) + print("Task id: ", task_id) + # Save the model on chief worker if 
strategy.cluster_resolver.task_id == 0: print("Saving model on chief") From e57404d9db1cf4ed054a08009f7d6069e52d2ddf Mon Sep 17 00:00:00 2001 From: kandakji <44462355+kandakji@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:25:56 +0200 Subject: [PATCH 06/14] Update tensorflow_multi_worker_mirrored_strategy.ipynb --- ...rflow_multi_worker_mirrored_strategy.ipynb | 71 ++++++++++++++++--- 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb index 8ffb673d98..f1fb28dd2a 100644 --- a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb +++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb @@ -7,6 +7,19 @@ "# Distributed training with TensorFlow Distribute Strategy API on Amazon SageMaker" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook.\n", + "\n", + "![This us-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n", + "\n", + "---" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -118,10 +131,10 @@ "\n", "if local_mode:\n", " instance_type = \"local_gpu\"\n", - " instance_count=1\n", + " instance_count = 1\n", "else:\n", " instance_type = \"ml.g5.xlarge\"\n", - " instance_count=1\n", + " instance_count = 1\n", "\n", "mnist_estimator = TensorFlow(\n", " entry_point=\"mnist.py\",\n", @@ -129,7 +142,7 @@ " instance_count=instance_count,\n", " instance_type=instance_type,\n", " framework_version=\"2.14.1\",\n", - " py_version=\"py310\"\n", + " py_version=\"py310\",\n", ")" ] }, @@ -280,10 +293,10 @@ "\n", "if local_mode:\n", " instance_type = \"local_gpu\"\n", - " instance_count=1\n", + " instance_count = 1\n", "else:\n", " instance_type = \"ml.g5.24xlarge\"\n", - " instance_count=2\n", + " instance_count = 2\n", "\n", "mnist_estimator_distibuted = TensorFlow(\n", " entry_point=\"mnist-distributed.py\",\n", @@ -292,11 +305,7 @@ " instance_type=instance_type,\n", " framework_version=\"2.14.1\",\n", " py_version=\"py310\",\n", - " distribution={\n", - " \"multi_worker_mirrored_strategy\": {\n", - " \"enabled\": True\n", - " }\n", - " }\n", + " distribution={\"multi_worker_mirrored_strategy\": {\"enabled\": True}},\n", ")" ] }, @@ -317,6 +326,48 @@ "source": [ "mnist_estimator_distibuted.fit(training_data_uri)" ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Notebook CI Test Results\n", + "\n", + "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", + "\n", + "\n", + "![This us-east-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n", + "\n", + "![This us-east-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n", + "\n", + "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n", + "\n", + "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n", + "\n", + "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n", + "\n", + "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n", + "\n", + "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n", + "\n", + "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n", + "\n", + "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/NOTEBOOK_PATH)\n", + "\n", + "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n", + "\n", + "![This ap-southeast-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n",
+    "\n",
+    "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n",
+    "\n",
+    "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n",
+    "\n",
+    "![This ap-northeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n",
+    "\n",
+    "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n"
+   ]
+  }
 ],
 "metadata": {

From 30abe063b6a8c571388cc0fd3b56b8042ff3dd62 Mon Sep 17 00:00:00 2001
From: kandakji <44462355+kandakji@users.noreply.github.com>
Date: Wed, 31 Jul 2024 12:40:34 +0200
Subject: [PATCH 07/14] Update distributed_training index.rst

---
 training/distributed_training/index.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/training/distributed_training/index.rst b/training/distributed_training/index.rst
index 17c42631ad..ee22ce8707 100644
--- a/training/distributed_training/index.rst
+++ b/training/distributed_training/index.rst
@@ -159,6 +159,15 @@ Horovod
     /sagemaker-python-sdk/keras_script_mode_pipe_mode_horovod/tensorflow_keras_CIFAR10
 
 
+TensorFlow Multi-Worker Mirrored Strategy (MWMS)
+------------------------------------------------
+
+.. toctree::
+   :maxdepth: 1
+
+   tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy
+
+
 .. _mxnet-distributed:
 
 Apache MXNet

From 08cec8f51fafb8a0613869c08d08161391e17aa9 Mon Sep 17 00:00:00 2001
From: kandakji <44462355+kandakji@users.noreply.github.com>
Date: Wed, 31 Jul 2024 12:47:51 +0200
Subject: [PATCH 08/14] Update main README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index cc8b819911..b23af5dbf9 100644
--- a/README.md
+++ b/README.md
@@ -196,6 +196,7 @@ More examples for models such as BERT and YOLOv5 can be found in [distributed_tr
 - [Train GPT-2 with Sharded Data Parallel](https://github.com/aws/amazon-sagemaker-examples/tree/main/training/distributed_training/pytorch/model_parallel/gpt2/smp-train-gpt-simple-sharded-data-parallel.ipynb) shows how to train GPT-2 with near-linear scaling using Sharded Data Parallelism technique in SageMaker Model Parallelism Library. 
- [Train EleutherAI GPT-J with Model Parallel](https://github.com/aws/amazon-sagemaker-examples/blob/main/training/distributed_training/pytorch/model_parallel/gpt-j/11_train_gptj_smp_tensor_parallel_notebook.ipynb) shows how to train EleutherAI GPT-J with PyTorch and Tensor Parallelism technique in the SageMaker Model Parallelism Library. - [Train MaskRCNN with Data Parallel](https://github.com/aws/amazon-sagemaker-examples/blob/main/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb) shows how to train MaskRCNN with PyTorch and SageMaker Data Parallelism Library. +- [Distributed training with TensorFlow Multi-Worker Mirrored Strategy API on Amazon SageMaker](https://github.com/aws/amazon-sagemaker-examples/blob/main/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb) shows how to train an MNIST Classifier with TensorFlow using TensorFlow's Multi-Worker Mirrored Strategy for Distributed Training. ### Amazon SageMaker Smart Sifting From 981555614ef4a8fe317b8d09ab803a64150c1777 Mon Sep 17 00:00:00 2001 From: kandakji <44462355+kandakji@users.noreply.github.com> Date: Wed, 31 Jul 2024 13:13:33 +0200 Subject: [PATCH 09/14] Update mnist.py dataset --- .../tensorflow/multi_worker_mirrored_strategy/mnist.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist.py b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist.py index c86f1b14db..5a96972c82 100644 --- a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist.py +++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist.py @@ -39,15 +39,15 @@ def model(x_train, y_train, x_test, y_test): def _load_training_data(base_dir): """Load MNIST training data""" - x_train = np.load(os.path.join(base_dir, "train_data.npy")) - y_train = np.load(os.path.join(base_dir, "train_labels.npy")) + x_train = np.load(os.path.join(base_dir, "input_train.npy")) + y_train = np.load(os.path.join(base_dir, "input_train_labels.npy")) return x_train, y_train def _load_testing_data(base_dir): """Load MNIST testing data""" - x_test = np.load(os.path.join(base_dir, "eval_data.npy")) - y_test = np.load(os.path.join(base_dir, "eval_labels.npy")) + x_test = np.load(os.path.join(base_dir, "input_test.npy")) + y_test = np.load(os.path.join(base_dir, "input_test_labels.npy")) return x_test, y_test From c3f22ce5c8476ef9462aebcce64702a8c80dc310 Mon Sep 17 00:00:00 2001 From: kandakji <44462355+kandakji@users.noreply.github.com> Date: Wed, 31 Jul 2024 13:13:53 +0200 Subject: [PATCH 10/14] Update mnist-distributed.py dataset --- .../multi_worker_mirrored_strategy/mnist-distributed.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py index 95169d7e80..bc75fd749a 100644 --- a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py +++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/mnist-distributed.py @@ -44,15 +44,15 @@ def model(x_train, y_train, x_test, y_test, strategy): def _load_training_data(base_dir): """Load MNIST training data""" - x_train = np.load(os.path.join(base_dir, "train_data.npy")) - y_train = 
np.load(os.path.join(base_dir, "train_labels.npy"))
+    x_train = np.load(os.path.join(base_dir, "input_train.npy"))
+    y_train = np.load(os.path.join(base_dir, "input_train_labels.npy"))
     return x_train, y_train
 
 
 def _load_testing_data(base_dir):
     """Load MNIST testing data"""
-    x_test = np.load(os.path.join(base_dir, "eval_data.npy"))
-    y_test = np.load(os.path.join(base_dir, "eval_labels.npy"))
+    x_test = np.load(os.path.join(base_dir, "input_test.npy"))
+    y_test = np.load(os.path.join(base_dir, "input_test_labels.npy"))
     return x_test, y_test
 
 

From f9de3cead7f853c47598a1956d13903e918b669a Mon Sep 17 00:00:00 2001
From: kandakji <44462355+kandakji@users.noreply.github.com>
Date: Wed, 31 Jul 2024 13:15:04 +0200
Subject: [PATCH 11/14] Update tensorflow_multi_worker_mirrored_strategy.ipynb
 dataset

---
 ...tensorflow_multi_worker_mirrored_strategy.ipynb | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
index f1fb28dd2a..bfbc2b956d 100644
--- a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
+++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
@@ -62,11 +62,11 @@
    "source": [
     "## Training Data\n",
     "\n",
-    "We will use the MNIST dataset, which has already been loaded to the public S3 buckets ``sagemaker-sample-data-<region>`` under the prefix ``tensorflow/mnist``. There are four ``.npy`` files under this prefix:\n",
-    "* ``train_data.npy``\n",
-    "* ``eval_data.npy``\n",
-    "* ``train_labels.npy``\n",
-    "* ``eval_labels.npy``"
+    "We will use the MNIST dataset, which has already been loaded to the public S3 buckets ``sagemaker-example-files-prod-<region>`` under the prefix ``datasets/image/MNIST``. 
There are four ``.npy`` files under this prefix:\n",
+    "* ``input_train.npy``\n",
+    "* ``input_test.npy``\n",
+    "* ``input_train_labels.npy``\n",
+    "* ``input_test_labels.npy``"
    ]
   },
@@ -77,7 +77,7 @@
    },
    "outputs": [],
    "source": [
-    "training_data_uri = \"s3://sagemaker-sample-data-{}/tensorflow/mnist\".format(region)"
+    "training_data_uri = \"s3://sagemaker-example-files-prod-{}/datasets/image/MNIST/numpy\".format(region)"
    ]
   },
@@ -127,7 +127,7 @@
    "source": [
     "from sagemaker.tensorflow import TensorFlow\n",
     "\n",
-    "local_mode = True\n",
+    "local_mode = False\n",
     "\n",
     "if local_mode:\n",
     "    instance_type = \"local_gpu\"\n",

From fd7f3f66a3f8a6fce2796c9e9df5adf50b66a5f7 Mon Sep 17 00:00:00 2001
From: kandakji <44462355+kandakji@users.noreply.github.com>
Date: Wed, 31 Jul 2024 13:25:08 +0200
Subject: [PATCH 12/14] Update tensorflow_multi_worker_mirrored_strategy.ipynb
 badges

---
 .../tensorflow_multi_worker_mirrored_strategy.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
index bfbc2b956d..e92c88f3a3 100644
--- a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
+++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
@@ -354,7 +354,7 @@
     "\n",
     "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n",
     "\n",
-    "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/NOTEBOOK_PATH)\n",
+    "![This eu-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n",
     "\n",
     "![This eu-north-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/training|distributed_training|tensorflow|multi_worker_mirrored_strategy|tensorflow_multi_worker_mirrored_strategy.ipynb)\n",
     "\n",

From dd8a3962efbefdac963b5bca72fefc112f30be9b Mon Sep 17 00:00:00 2001
From: kandakji <44462355+kandakji@users.noreply.github.com>
Date: Wed, 31 Jul 2024 13:38:25 +0200
Subject: [PATCH 13/14] Update sagemaker SDK in
 tensorflow_multi_worker_mirrored_strategy.ipynb

---
 .../tensorflow_multi_worker_mirrored_strategy.ipynb | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
index e92c88f3a3..be36a8ab83 100644
--- a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
+++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
@@ -38,6 +38,17 @@
    "Let's start by setting up the environment:"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "! pip install -U sagemaker"
+   ]
+  },
 {
  "cell_type": "code",
  "execution_count": null,
@@ -127,7 +138,7 @@
    "source": [
     "from sagemaker.tensorflow import TensorFlow\n",
     "\n",
-    "local_mode = False\n",
+    "local_mode = True\n",
     "\n",
     "if local_mode:\n",
     "    instance_type = \"local_gpu\"\n",

From 169c7878d8417ceec5c664e37687f7bf298cf22f Mon Sep 17 00:00:00 2001
From: kandakji <44462355+kandakji@users.noreply.github.com>
Date: Wed, 31 Jul 2024 13:56:18 +0200
Subject: [PATCH 14/14] downgrade TF version for PR test
 tensorflow_multi_worker_mirrored_strategy.ipynb

---
 .../tensorflow_multi_worker_mirrored_strategy.ipynb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
index be36a8ab83..c181e3b0c1 100644
--- a/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
+++ b/training/distributed_training/tensorflow/multi_worker_mirrored_strategy/tensorflow_multi_worker_mirrored_strategy.ipynb
@@ -122,7 +122,7 @@
     "\n",
     "The `sagemaker.tensorflow.TensorFlow` estimator handles locating the training container based on the framework version and the job type (Inference or Training), uploading your script to an S3 location and creating a SageMaker training job. Let's call out a couple of important parameters here:\n",
     "\n",
-    "* `framework_version` is set to `'2.14.1'` to indicate the TensorFlow version we want to use for executing your model training code. This tells SageMaker which DLC to use. Here's the list of the [available Deep Learning Container Images](https://github.com/aws/deep-learning-containers/blob/master/available_images.md).\n",
+    "* `framework_version` is set to `'2.13.0'` to indicate the TensorFlow version we want to use for executing your model training code. This tells SageMaker which DLC to use. 
Here's the list of the [available Deep Learning Container Images](https://github.com/aws/deep-learning-containers/blob/master/available_images.md).\n", "\n", "* `entry_point` is the absolute or relative path to the local Python source file that should be executed as the entry point to training. \n", "\n" @@ -152,7 +152,7 @@ " role=role,\n", " instance_count=instance_count,\n", " instance_type=instance_type,\n", - " framework_version=\"2.14.1\",\n", + " framework_version=\"2.13.0\",\n", " py_version=\"py310\",\n", ")" ] @@ -314,7 +314,7 @@ " role=role,\n", " instance_count=instance_count,\n", " instance_type=instance_type,\n", - " framework_version=\"2.14.1\",\n", + " framework_version=\"2.13.0\",\n", " py_version=\"py310\",\n", " distribution={\"multi_worker_mirrored_strategy\": {\"enabled\": True}},\n", ")"