25 | 25 | from tests.integ import DATA_DIR, PYTHON_VERSION, TUNING_DEFAULT_TIMEOUT_MINUTES |
26 | 26 | from tests.integ.record_set import prepare_record_set_from_local_files |
27 | 27 | from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name |
| 28 | +from tests.integ import vpc_test_utils |
28 | 29 |
29 | 30 | from sagemaker import KMeans, LDA, RandomCutForest |
30 | 31 | from sagemaker.amazon.amazon_estimator import registry |
@@ -491,6 +492,52 @@ def test_tuning_tf(sagemaker_session): |
491 | 492 | assert dict_result == list_result |
492 | 493 |
493 | 494 |
| 495 | +@pytest.mark.skipif(PYTHON_VERSION != 'py2', reason="TensorFlow image supports only python 2.") |
| 496 | +def test_tuning_tf_vpc_multi(sagemaker_session): |
| 497 | +    """Test TensorFlow multi-instance hyperparameter tuning with a VpcConfig and inter-container traffic encryption enabled""" |
| 498 | + instance_type = 'ml.c4.xlarge' |
| 499 | + instance_count = 2 |
| 500 | + |
| 501 | + script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py') |
| 502 | + |
| 503 | + ec2_client = sagemaker_session.boto_session.client('ec2') |
| 504 | + subnet_ids, security_group_id = vpc_test_utils.get_or_create_vpc_resources(ec2_client, |
| 505 | + sagemaker_session.boto_region_name) |
| 506 | + vpc_test_utils.setup_security_group_for_encryption(ec2_client, security_group_id) |
| 507 | + |
| 508 | + estimator = TensorFlow(entry_point=script_path, |
| 509 | + role='SageMakerRole', |
| 510 | + training_steps=1, |
| 511 | + evaluation_steps=1, |
| 512 | + hyperparameters={'input_tensor_name': 'inputs'}, |
| 513 | + train_instance_count=instance_count, |
| 514 | + train_instance_type=instance_type, |
| 515 | + sagemaker_session=sagemaker_session, |
| 516 | + base_job_name='test-vpc-tf', |
| 517 | + subnets=subnet_ids, |
| 518 | + security_group_ids=[security_group_id], |
| 519 | + encrypt_inter_container_traffic=True) |
| 520 | + |
| 521 | + inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris') |
| 522 | + hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.05, 0.2)} |
| 523 | + |
| 524 | + objective_metric_name = 'loss' |
| 525 | + metric_definitions = [{'Name': 'loss', 'Regex': 'loss = ([0-9\\.]+)'}] |
| 526 | + |
| 527 | + tuner = HyperparameterTuner(estimator, objective_metric_name, hyperparameter_ranges, |
| 528 | + metric_definitions, |
| 529 | + objective_type='Minimize', max_jobs=2, max_parallel_jobs=2) |
| 530 | + |
| 531 | + tuning_job_name = unique_name_from_base('tune-tf', max_length=32) |
| 532 | + with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES): |
| 533 | + tuner.fit(inputs, job_name=tuning_job_name) |
| 534 | + |
| 535 | +    print('Started hyperparameter tuning job with name: ' + tuning_job_name) |
| 536 | + |
| 537 | +    time.sleep(15)  # brief pause before polling the tuning job for completion |
| 538 | + tuner.wait() |
| 539 | + |
| 540 | + |
494 | 541 | @pytest.mark.continuous_testing |
495 | 542 | def test_tuning_chainer(sagemaker_session): |
496 | 543 | with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES): |
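A possible follow-up check (a sketch only, not part of the commit above): after tuner.wait() returns, the test could describe the best training job and assert that the VpcConfig and inter-container traffic encryption were actually applied. HyperparameterTuner.best_training_job() and the DescribeTrainingJob response fields used below are existing SageMaker SDK/API surface; the assertions themselves are a hypothetical extension appended inside test_tuning_tf_vpc_multi, reusing its local variables.

    # Hypothetical extra assertions for test_tuning_tf_vpc_multi (not in the diff above).
    best_training_job = tuner.best_training_job()
    desc = sagemaker_session.sagemaker_client.describe_training_job(
        TrainingJobName=best_training_job)

    # The winning training job should carry the same VPC settings passed to the estimator
    # and have inter-container traffic encryption turned on.
    assert desc['EnableInterContainerTrafficEncryption'] is True
    assert sorted(desc['VpcConfig']['Subnets']) == sorted(subnet_ids)
    assert desc['VpcConfig']['SecurityGroupIds'] == [security_group_id]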