
Commit bbb3f16

akhilmehra authored and ChoiByungWook committed
feature: add model parallelism support (#441)
1 parent c8104af commit bbb3f16

File tree

5 files changed (+233, -8 lines)


src/sagemaker/fw_utils.py

Lines changed: 90 additions & 0 deletions
@@ -76,6 +76,96 @@ def validate_source_dir(script, directory):
     return True


+def get_mp_parameters(distribution):
+    """Get the model parallelism parameters provided by the user
+
+    Args:
+        distribution: distribution dictionary defined by the user
+
+    Returns:
+        params: dictionary containing model parallelism parameters
+        to be used for training
+    """
+    try:
+        mp_dict = distribution["smdistributed"]["modelparallel"]
+    except KeyError:
+        mp_dict = {}
+    if mp_dict.get("enabled", False) is True:
+        params = mp_dict.get("parameters", {})
+        validate_mp_config(params)
+        return params
+    return None
+
+
+def validate_mp_config(config):
+    """Validate the configuration dictionary for model parallelism.
+
+    Args:
+        config (dict): Dictionary holding configuration keys and values.
+
+    Raises:
+        ValueError: If any of the keys have incorrect values.
+    """
+
+    if "partitions" not in config:
+        raise ValueError("'partitions' is a required parameter.")
+
+    def validate_positive(key):
+        try:
+            if not isinstance(config[key], int) or config[key] < 1:
+                raise ValueError(f"The number of {key} must be a positive integer.")
+        except KeyError:
+            pass
+
+    def validate_in(key, vals):
+        try:
+            if config[key] not in vals:
+                raise ValueError(f"{key} must be a value in: {vals}.")
+        except KeyError:
+            pass
+
+    def validate_bool(keys):
+        validate_in(keys, [True, False])
+
+    validate_in("pipeline", ["simple", "interleaved", "_only_forward"])
+    validate_in("placement_strategy", ["spread", "cluster"])
+    validate_in("optimize", ["speed", "memory"])
+
+    for key in ["microbatches", "partitions"]:
+        validate_positive(key)
+
+    for key in ["auto_partition", "contiguous", "load_partition", "horovod", "ddp"]:
+        validate_bool(key)
+
+    if "partition_file" in config and not isinstance(config.get("partition_file"), str):
+        raise ValueError("'partition_file' must be a str.")
+
+    if config.get("auto_partition") is False and "default_partition" not in config:
+        raise ValueError("default_partition must be supplied if auto_partition is set to False!")
+
+    if "default_partition" in config and config["default_partition"] >= config["partitions"]:
+        raise ValueError("default_partition must be less than the number of partitions!")
+
+    if "memory_weight" in config and (
+        config["memory_weight"] > 1.0 or config["memory_weight"] < 0.0
+    ):
+        raise ValueError("memory_weight must be between 0.0 and 1.0!")
+
+    if "ddp_port" in config and "ddp" not in config:
+        raise ValueError("`ddp_port` needs `ddp` to be set as well")
+
+    if "ddp_dist_backend" in config and "ddp" not in config:
+        raise ValueError("`ddp_dist_backend` needs `ddp` to be set as well")
+
+    if "ddp_port" in config:
+        if not isinstance(config["ddp_port"], int) or config["ddp_port"] < 0:
+            value = config["ddp_port"]
+            raise ValueError(f"Invalid port number {value}.")
+
+    if config.get("horovod", False) and config.get("ddp", False):
+        raise ValueError("'ddp' and 'horovod' cannot be simultaneously enabled.")
+
+
 def tar_and_upload_dir(
     session,
     bucket,

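To see how the two new helpers behave together, here is a minimal usage sketch (not part of the commit); the distribution dict and its parameter values below are illustrative examples, not SDK defaults.

from sagemaker.fw_utils import get_mp_parameters, validate_mp_config

# Illustrative user-supplied distribution dict.
distribution = {
    "mpi": {"enabled": True},
    "smdistributed": {
        "modelparallel": {
            "enabled": True,
            "parameters": {"partitions": 2, "microbatches": 4, "pipeline": "interleaved"},
        }
    },
}

# Returns the validated parameters dict when model parallelism is enabled...
print(get_mp_parameters(distribution))
# {'partitions': 2, 'microbatches': 4, 'pipeline': 'interleaved'}

# ...and None when it is disabled or absent.
assert get_mp_parameters({"mpi": {"enabled": True}}) is None

# validate_mp_config can also be called directly; omitting 'partitions' raises ValueError.
try:
    validate_mp_config({"microbatches": 4})
except ValueError as err:
    print(err)  # 'partitions' is a required parameter.
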
src/sagemaker/pytorch/estimator.py

Lines changed: 57 additions & 4 deletions
@@ -24,7 +24,9 @@
     framework_version_from_tag,
     python_deprecation_warning,
     validate_version_or_image_args,
+    warn_if_parameter_server_with_multi_gpu,
     validate_smdistributed,
+    get_mp_parameters,
 )
 from sagemaker.pytorch import defaults
 from sagemaker.pytorch.model import PyTorchModel
@@ -93,7 +95,6 @@ def __init__(
                 for training and hosting, instead of selecting the appropriate
                 SageMaker official image based on framework_version and
                 py_version. It can be an ECR url or dockerhub image and tag.
-
                 Examples:
                     * ``123412341234.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0``
                     * ``custom-image:latest``
@@ -102,17 +103,41 @@ def __init__(
                 ``image_uri`` is required. If also ``None``, then a ``ValueError``
                 will be raised.
             distribution (dict): A dictionary with information on how to run distributed training
-                (default: None). Currently we support distributed training with SMDistributed
-                Data Parallel strategy.
+                (default: None). Currently we support distributed training with parameter servers,
+                Model Parallelism, Data Parallelism, and MPI. Model Parallelism can only be used
+                with MPI.
+                To enable parameter server use the following setup:
+
+                .. code:: python
+
+                    {
+                        "parameter_server": {
+                            "enabled": True
+                        }
+                    }
+
+                To enable MPI:
+
+                .. code:: python
+
+                    {
+                        "mpi": {
+                            "enabled": True
+                        }
+                    }

-                To enable SMDistributed Data Parallel:
+                To enable SMDistributed Data Parallel or Model Parallel:

                 .. code:: python

                     {
                         "smdistributed": {
                             "dataparallel": {
                                 "enabled": True
+                            },
+                            "modelparallel": {
+                                "enabled": True,
+                                "parameters": {}
                             }
                         }
                     }
@@ -148,6 +173,10 @@ def __init__(
             image_uri=image_uri,
         )

+        warn_if_parameter_server_with_multi_gpu(
+            training_instance_type=instance_type, distribution=distribution
+        )
+
         if "enable_sagemaker_metrics" not in kwargs:
             # enable sagemaker metrics for PT v1.3 or greater:
             if self.framework_version and Version(self.framework_version) >= Version("1.3"):
@@ -163,6 +192,30 @@ def hyperparameters(self):
         hyperparameters = super(PyTorch, self).hyperparameters()
         additional_hyperparameters = {}

+        if "parameter_server" in self.distribution:
+            ps_enabled = self.distribution.get("parameter_server").get("enabled", False)
+            additional_hyperparameters[self.LAUNCH_PS_ENV_NAME] = ps_enabled
+
+        if "mpi" in self.distribution:
+            mpi_dict = self.distribution["mpi"]
+            mpi_enabled = mpi_dict.get("enabled", False)
+            additional_hyperparameters[self.LAUNCH_MPI_ENV_NAME] = mpi_enabled
+
+            if mpi_dict.get("processes_per_host"):
+                additional_hyperparameters[self.MPI_NUM_PROCESSES_PER_HOST] = mpi_dict.get(
+                    "processes_per_host"
+                )
+
+            additional_hyperparameters[self.MPI_CUSTOM_MPI_OPTIONS] = mpi_dict.get(
+                "custom_mpi_options", ""
+            )
+
+            if get_mp_parameters(self.distribution):
+                additional_hyperparameters["mp_parameters"] = get_mp_parameters(self.distribution)
+
+        elif "modelparallel" in self.distribution.get("smdistributed", {}):
+            raise ValueError("Cannot use Model Parallelism without MPI enabled!")
+
         if "smdistributed" in self.distribution:
             # smdistributed strategy selected
             smdistributed = self.distribution["smdistributed"]

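Putting the docstring and hyperparameters() changes together, a hedged sketch of how a user might enable model parallelism on the PyTorch estimator; the entry point, role, framework version, and instance settings are placeholder values, not taken from this commit.

from sagemaker.pytorch import PyTorch

estimator = PyTorch(
    entry_point="train.py",          # placeholder training script
    role="SageMakerRole",            # placeholder IAM role
    framework_version="1.6.0",       # placeholder; pick a version with smdistributed support
    py_version="py3",
    instance_count=2,
    instance_type="ml.p3.16xlarge",  # placeholder GPU instance type
    distribution={
        # Model parallelism requires MPI, per the new check in hyperparameters().
        "mpi": {"enabled": True, "processes_per_host": 8},
        "smdistributed": {
            "modelparallel": {
                "enabled": True,
                "parameters": {"partitions": 2, "microbatches": 4},
            }
        },
    },
)
# estimator.fit("s3://my-bucket/training-data")  # placeholder S3 input

When fit() runs, hyperparameters() folds the validated model parallelism settings into the "mp_parameters" hyperparameter alongside the MPI settings.
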
src/sagemaker/tensorflow/estimator.py

Lines changed: 16 additions & 3 deletions
@@ -81,8 +81,9 @@ def __init__(
                 ``image_uri`` is required. If also ``None``, then a ``ValueError``
                 will be raised.
             distribution (dict): A dictionary with information on how to run distributed training
-                (default: None). Currently we support distributed training with parameter servers
-                and MPI.
+                (default: None). Currently we support distributed training with parameter servers,
+                Model Parallelism, Data Parallelism, and MPI. Model Parallelism can only be used
+                with MPI.
                 To enable parameter server use the following setup:

                 .. code:: python
@@ -103,14 +104,18 @@ def __init__(
                     }
                 }

-                To enable SMDistributed Data Parallel:
+                To enable SMDistributed Data Parallel or Model Parallel:

                 .. code:: python

                     {
                         "smdistributed": {
                             "dataparallel": {
                                 "enabled": True
+                            },
+                            "modelparallel": {
+                                "enabled": True,
+                                "parameters": {}
                             }
                         }
                     }
@@ -335,6 +340,14 @@ def hyperparameters(self):
                 "custom_mpi_options", ""
             )

+            if fw.get_mp_parameters(self.distribution):
+                additional_hyperparameters["mp_parameters"] = fw.get_mp_parameters(
+                    self.distribution
+                )
+
+        elif "modelparallel" in self.distribution.get("smdistributed", {}):
+            raise ValueError("Cannot use Model Parallelism without MPI enabled!")
+
         if "smdistributed" in self.distribution:
             # smdistributed strategy selected
             smdistributed = self.distribution["smdistributed"]

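The TensorFlow estimator accepts the same distribution shape; a brief sketch (values are illustrative) of the valid and invalid combinations the new check distinguishes:

# Valid: model parallelism together with MPI.
distribution = {
    "mpi": {"enabled": True},
    "smdistributed": {"modelparallel": {"enabled": True, "parameters": {"partitions": 2}}},
}

# Invalid: model parallelism without MPI. When hyperparameters() runs (for example during
# fit()), the estimator raises ValueError("Cannot use Model Parallelism without MPI enabled!").
bad_distribution = {
    "smdistributed": {"modelparallel": {"enabled": True, "parameters": {"partitions": 2}}},
}
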
tests/integ/test_pytorch.py

Lines changed: 29 additions & 1 deletion
@@ -36,6 +36,28 @@
 EIA_SCRIPT = os.path.join(EIA_DIR, "empty_inference_script.py")


+@pytest.fixture(scope="module", name="pytorch_mpi_training_job")
+def fixture_mpi_training_job(
+    sagemaker_session,
+    pytorch_training_latest_version,
+    pytorch_training_latest_py_version,
+    cpu_instance_type,
+):
+
+    distribution_dict = {"mpi": {"enabled": True}}
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
+        pytorch = _get_pytorch_estimator(
+            sagemaker_session,
+            pytorch_training_latest_version,
+            pytorch_training_latest_py_version,
+            cpu_instance_type,
+            distributions_dict=distribution_dict,
+        )
+
+        pytorch.fit({"training": _upload_training_data(pytorch)})
+        return pytorch.latest_training_job.name
+
+
 @pytest.fixture(scope="module", name="pytorch_training_job")
 def fixture_training_job(
     sagemaker_session,
@@ -220,7 +242,12 @@ def _upload_training_data(pytorch):


 def _get_pytorch_estimator(
-    sagemaker_session, pytorch_version, py_version, instance_type, entry_point=MNIST_SCRIPT
+    sagemaker_session,
+    pytorch_version,
+    py_version,
+    instance_type,
+    entry_point=MNIST_SCRIPT,
+    distributions_dict={},
 ):
     return PyTorch(
         entry_point=entry_point,
@@ -230,6 +257,7 @@ def _get_pytorch_estimator(
         instance_count=1,
         instance_type=instance_type,
         sagemaker_session=sagemaker_session,
+        distributions=distributions_dict,
     )


tests/unit/test_fw_utils.py

Lines changed: 41 additions & 0 deletions
@@ -19,6 +19,7 @@
 from itertools import product

 import pytest
+
 from mock import Mock, patch

 from sagemaker import fw_utils
@@ -92,6 +93,46 @@ def test_tar_and_upload_dir_s3_with_kms(utils, sagemaker_session):
     obj.upload_file.assert_called_with(utils.create_tar_file(), ExtraArgs=extra_args)


+def test_mp_config_partition_exists():
+    mp_parameters = {}
+    with pytest.raises(ValueError):
+        fw_utils.validate_mp_config(mp_parameters)
+
+
+@pytest.mark.parametrize(
+    "pipeline, placement_strategy, optimize, trace_device",
+    [
+        ("simple", "spread", "speed", "cpu"),
+        ("interleaved", "cluster", "memory", "gpu"),
+        ("_only_forward", "spread", "speed", "gpu"),
+    ],
+)
+def test_mp_config_string_names(pipeline, placement_strategy, optimize, trace_device):
+    mp_parameters = {
+        "partitions": 2,
+        "pipeline": pipeline,
+        "placement_strategy": placement_strategy,
+        "optimize": optimize,
+        "trace_device": trace_device,
+    }
+    fw_utils.validate_mp_config(mp_parameters)
+
+
+def test_mp_config_auto_partition_arg():
+    mp_parameters = {}
+    mp_parameters["partitions"] = 2
+    mp_parameters["auto_partition"] = False
+    with pytest.raises(ValueError):
+        fw_utils.validate_mp_config(mp_parameters)
+
+    mp_parameters["default_partition"] = 1
+    fw_utils.validate_mp_config(mp_parameters)
+
+    mp_parameters["default_partition"] = 4
+    with pytest.raises(ValueError):
+        fw_utils.validate_mp_config(mp_parameters)
+
+
 def test_validate_source_dir_does_not_exits(sagemaker_session):
     script = "mnist.py"
     directory = " !@#$%^&*()path probably in not there.!@#$%^&*()"

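Beyond the cases covered above, a few more illustrative inputs (not part of this test file) that exercise the remaining validation branches in validate_mp_config:

import pytest

from sagemaker import fw_utils

# ddp_port requires ddp to be set.
with pytest.raises(ValueError):
    fw_utils.validate_mp_config({"partitions": 2, "ddp_port": 1111})

# horovod and ddp cannot both be enabled.
with pytest.raises(ValueError):
    fw_utils.validate_mp_config({"partitions": 2, "horovod": True, "ddp": True})

# memory_weight must lie in [0.0, 1.0].
with pytest.raises(ValueError):
    fw_utils.validate_mp_config({"partitions": 2, "memory_weight": 1.5})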