feature: add spot instance support for AlgorithmEstimator (#1672)

metrizable · web-flow · commit 362436430861 · 2020-07-06T23:27:47.000-07:00
diff --git a/src/sagemaker/algorithm.py b/src/sagemaker/algorithm.py
@@ -53,6 +53,8 @@ def __init__(
         model_channel_name="model",
         metric_definitions=None,
         encrypt_inter_container_traffic=False,
+        train_use_spot_instances=False,
+        train_max_wait=None,
         **kwargs  # pylint: disable=W0613
     ):
         """Initialize an ``AlgorithmEstimator`` instance.
@@ -125,6 +127,17 @@ def __init__(
                 expression used to extract the metric from the logs.
             encrypt_inter_container_traffic (bool): Specifies whether traffic between training
                 containers is encrypted for the training job (default: ``False``).
+            train_use_spot_instances (bool): Specifies whether to use SageMaker
+                Managed Spot instances for training. If enabled then the
+                `train_max_wait` arg should also be set.
+
+                More information:
+                https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html
+                (default: ``False``).
+            train_max_wait (int): Timeout in seconds waiting for spot training
+                instances. After this amount of time Amazon SageMaker will
+                stop waiting for Spot instances to become available
+                (default: ``None``).
             **kwargs: Additional kwargs. This is unused. It's only added for AlgorithmEstimator
                 to ignore the irrelevant arguments.
         """
@@ -148,6 +161,8 @@ def __init__(
             model_channel_name=model_channel_name,
             metric_definitions=metric_definitions,
             encrypt_inter_container_traffic=encrypt_inter_container_traffic,
+            train_use_spot_instances=train_use_spot_instances,
+            train_max_wait=train_max_wait,
         )
 
         self.algorithm_spec = self.sagemaker_session.sagemaker_client.describe_algorithm(
diff --git a/tests/unit/test_algorithm.py b/tests/unit/test_algorithm.py
@@ -1015,3 +1015,18 @@ def test_algorithm_attach_from_hyperparameter_tuning():
     assert estimator.train_volume_size == train_volume_size
     assert estimator.input_mode == input_mode
     assert estimator.sagemaker_session == session
+
+
+@patch("sagemaker.Session")
+def test_algorithm_supported_with_spot_instances(session):
+    session.sagemaker_client.describe_algorithm = Mock(return_value=DESCRIBE_ALGORITHM_RESPONSE)
+
+    assert AlgorithmEstimator(  # passes if construction succeeds and the instance is truthy
+        algorithm_arn="arn:aws:sagemaker:us-east-2:1234:algorithm/scikit-decision-trees",
+        role="SageMakerRole",
+        train_instance_type="ml.m4.xlarge",
+        train_instance_count=1,
+        train_use_spot_instances=True,  # new argument under test
+        train_max_wait=500,  # seconds; should accompany train_use_spot_instances
+        sagemaker_session=session,
+    )