
Commit 4d0c2e6

change: Warn if parameter server is used with multi-GPU instance (#1376)
Distributed training with a parameter server on multi-GPU instances is not supported. Warn the user that training will not fully leverage all GPU cores if a parameter server is enabled and a multi-GPU instance is selected.
1 parent d8b3012 commit 4d0c2e6
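
Concretely, the configuration this commit warns about is a parameter-server distribution combined with a multi-GPU training instance. A minimal sketch of that combination (the instance type below is just one example of a multi-GPU type, not a value taken from the commit):

# Sketch only: per the new check, any "p"-family instance other than
# ml.p2.xlarge / ml.p3.2xlarge counts as multi-GPU.
distributions = {"parameter_server": {"enabled": True}}
train_instance_type = "ml.p3.8xlarge"  # 4 GPUs, but only one parameter server worker runs per host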

File tree

4 files changed: +72 −0 lines changed

src/sagemaker/fw_utils.py
src/sagemaker/mxnet/estimator.py
src/sagemaker/tensorflow/estimator.py
tests/unit/test_fw_utils.py


src/sagemaker/fw_utils.py
Lines changed: 49 additions & 0 deletions

@@ -13,6 +13,7 @@
 """Utility methods used by framework classes"""
 from __future__ import absolute_import

+import logging
 import os
 import re
 import shutil
@@ -23,6 +24,8 @@
 from sagemaker import s3
 from sagemaker.utils import get_ecr_image_uri_prefix, ECR_URI_PATTERN

+logger = logging.getLogger("sagemaker")
+
 _TAR_SOURCE_FILENAME = "source.tar.gz"

 UploadedCode = namedtuple("UserCode", ["s3_prefix", "script_name"])
@@ -42,6 +45,13 @@
     "Python 2. Newer versions of {framework} will only be available for Python 3."
     "Please set the argument \"py_version='py3'\" to use the Python 3 {framework} image."
 )
+PARAMETER_SERVER_MULTI_GPU_WARNING = (
+    "You have selected a multi-GPU training instance type. "
+    "You have also enabled parameter server for distributed training. "
+    "Distributed training with the default parameter server configuration will not "
+    "fully leverage all GPU cores; the parameter server will be configured to run "
+    "only one worker per host regardless of the number of GPUs."
+)


 EMPTY_FRAMEWORK_VERSION_ERROR = (
@@ -68,6 +78,7 @@
 DEFAULT_ACCOUNT = "520713654638"
 ASIMOV_PROD_ACCOUNT = "763104351884"
 ASIMOV_DEFAULT_ACCOUNT = ASIMOV_PROD_ACCOUNT
+SINGLE_GPU_INSTANCE_TYPES = ("ml.p2.xlarge", "ml.p3.2xlarge")

 MERGED_FRAMEWORKS_REPO_MAP = {
     "tensorflow-scriptmode": "tensorflow-training",
@@ -490,6 +501,44 @@ def empty_framework_version_warning(default_version, latest_version):
     return " ".join(msgs)


+def warn_if_parameter_server_with_multi_gpu(training_instance_type, distributions):
+    """Warn the user that training will not fully leverage all the GPU
+    cores if parameter server is enabled and a multi-GPU instance is selected.
+    Distributed training with the default parameter server setup doesn't
+    support multi-GPU instances.
+
+    Args:
+        training_instance_type (str): A string representing the type of training instance selected.
+        distributions (dict): A dictionary with information to enable distributed training.
+            (Defaults to None if distributed training is not enabled.) For example:
+
+            .. code:: python
+
+                {
+                    'parameter_server':
+                    {
+                        'enabled': True
+                    }
+                }
+
+
+    """
+    if training_instance_type == "local" or distributions is None:
+        return
+
+    is_multi_gpu_instance = (
+        training_instance_type.split(".")[1].startswith("p")
+        and training_instance_type not in SINGLE_GPU_INSTANCE_TYPES
+    )
+
+    ps_enabled = "parameter_server" in distributions and distributions["parameter_server"].get(
+        "enabled", False
+    )
+
+    if is_multi_gpu_instance and ps_enabled:
+        logger.warning(PARAMETER_SERVER_MULTI_GPU_WARNING)
+
+
 def get_unsupported_framework_version_error(
     framework_name, unsupported_version, supported_versions
 ):
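
To make the detection heuristic concrete, here is a hedged sketch of calling the new helper directly; the logging setup is illustrative and not part of the commit:

import logging

from sagemaker import fw_utils

logging.basicConfig(level=logging.WARNING)

ps = {"parameter_server": {"enabled": True}}

# Family "p2" starts with "p" and ml.p2.8xlarge is not in SINGLE_GPU_INSTANCE_TYPES,
# so this call logs PARAMETER_SERVER_MULTI_GPU_WARNING.
fw_utils.warn_if_parameter_server_with_multi_gpu("ml.p2.8xlarge", ps)

# ml.p3.2xlarge is listed in SINGLE_GPU_INSTANCE_TYPES and ml.c5.xlarge is not a
# "p"-family instance, so neither call warns.
fw_utils.warn_if_parameter_server_with_multi_gpu("ml.p3.2xlarge", ps)
fw_utils.warn_if_parameter_server_with_multi_gpu("ml.c5.xlarge", ps)

# Local mode and distributions=None return early before any instance-type parsing.
fw_utils.warn_if_parameter_server_with_multi_gpu("local", ps)
fw_utils.warn_if_parameter_server_with_multi_gpu("ml.p2.8xlarge", None)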

src/sagemaker/mxnet/estimator.py
Lines changed: 7 additions & 0 deletions

@@ -22,6 +22,7 @@
     empty_framework_version_warning,
     python_deprecation_warning,
     is_version_equal_or_higher,
+    warn_if_parameter_server_with_multi_gpu,
 )
 from sagemaker.mxnet import defaults
 from sagemaker.mxnet.model import MXNetModel
@@ -126,6 +127,12 @@ def __init__(
                 python_deprecation_warning(self.__framework_name__, defaults.LATEST_PY2_VERSION)
             )

+        if distributions is not None:
+            train_instance_type = kwargs.get("train_instance_type")
+            warn_if_parameter_server_with_multi_gpu(
+                training_instance_type=train_instance_type, distributions=distributions
+            )
+
         self.py_version = py_version
         self._configure_distribution(distributions)
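
For illustration, a hypothetical MXNet estimator configuration that would now log the warning during construction; the entry point, role ARN, and framework version are placeholders, not values from the commit:

from sagemaker.mxnet import MXNet

# Hypothetical values throughout; only the multi-GPU instance type plus the
# enabled parameter server matter for triggering the warning.
estimator = MXNet(
    entry_point="train.py",                                # placeholder script
    role="arn:aws:iam::111122223333:role/SageMakerRole",   # placeholder role
    train_instance_count=2,
    train_instance_type="ml.p3.8xlarge",                   # multi-GPU instance
    framework_version="1.6.0",
    py_version="py3",
    distributions={"parameter_server": {"enabled": True}},
)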

src/sagemaker/tensorflow/estimator.py
Lines changed: 6 additions & 0 deletions

@@ -307,6 +307,12 @@ def __init__(
                 fw.python_deprecation_warning(self.__framework_name__, defaults.LATEST_PY2_VERSION)
             )

+        if distributions is not None:
+            train_instance_type = kwargs.get("train_instance_type")
+            fw.warn_if_parameter_server_with_multi_gpu(
+                training_instance_type=train_instance_type, distributions=distributions
+            )
+
         if "enable_sagemaker_metrics" not in kwargs:
             # enable sagemaker metrics for TF v1.15 or greater:
             if fw.is_version_equal_or_higher([1, 15], self.framework_version):
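
Since the message is emitted through the shared "sagemaker" logger introduced in fw_utils.py, users who have deliberately accepted the one-worker-per-host behavior can silence it before constructing an estimator; a minimal sketch using only the standard library:

import logging

# Raising the "sagemaker" logger above WARNING suppresses this warning
# (along with other SDK warnings) without affecting error reporting.
logging.getLogger("sagemaker").setLevel(logging.ERROR)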

tests/unit/test_fw_utils.py
Lines changed: 10 additions & 0 deletions

@@ -1170,3 +1170,13 @@ def test_region_supports_debugger_feature_returns_true_for_supported_regions():
 def test_region_supports_debugger_feature_returns_false_for_unsupported_regions():
     assert fw_utils._region_supports_debugger("us-gov-west-1") is False
     assert fw_utils._region_supports_debugger("us-iso-east-1") is False
+
+
+def test_warn_if_parameter_server_with_multi_gpu(caplog):
+    train_instance_type = "ml.p2.8xlarge"
+    distributions = {"parameter_server": {"enabled": True}}
+
+    fw_utils.warn_if_parameter_server_with_multi_gpu(
+        training_instance_type=train_instance_type, distributions=distributions
+    )
+    assert fw_utils.PARAMETER_SERVER_MULTI_GPU_WARNING in caplog.text
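
The commit only tests the positive case; a hypothetical companion test in the same style (not part of the diff) could assert the warning is absent for a single-GPU instance:

def test_warn_if_parameter_server_with_multi_gpu_single_gpu_instance(caplog):
    # Hypothetical negative case: ml.p3.2xlarge is in SINGLE_GPU_INSTANCE_TYPES,
    # so no warning should be logged.
    fw_utils.warn_if_parameter_server_with_multi_gpu(
        training_instance_type="ml.p3.2xlarge",
        distributions={"parameter_server": {"enabled": True}},
    )
    assert fw_utils.PARAMETER_SERVER_MULTI_GPU_WARNING not in caplog.text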
