Skip to content

Commit db6c55e

Browse files
lianyidingchuyang-deng
authored and committed
change: fix tests for new regions (#983)
* fix test failures related to m4, c4 and t2 region availability. * rename test_hvd_basic.py to hvd_basic.py * use regional s3 endpoint in TF script mode tests
1 parent 75f3554 commit db6c55e

File tree

7 files changed

+44
-18
lines changed

7 files changed

+44
-18
lines changed

tests/conftest.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@
3131

3232
DEFAULT_REGION = "us-west-2"
3333

34-
NO_M4_REGIONS = ["eu-west-3", "eu-north-1", "ap-east-1"]
34+
NO_M4_REGIONS = ["eu-west-3", "eu-north-1", "ap-east-1", "sa-east-1"]
35+
36+
NO_T2_REGIONS = ["eu-north-1", "ap-east-1"]
3537

3638

3739
def pytest_addoption(parser):
@@ -262,6 +264,16 @@ def cpu_instance_type(sagemaker_session, request):
262264
return "ml.m4.xlarge"
263265

264266

267+
@pytest.fixture(scope="session")
268+
def alternative_cpu_instance_type(sagemaker_session, request):
269+
region = sagemaker_session.boto_session.region_name
270+
if region in NO_T2_REGIONS:
271+
# T3 is not supported by hosting yet
272+
return "ml.c5.xlarge"
273+
else:
274+
return "ml.t2.medium"
275+
276+
265277
@pytest.fixture(scope="session")
266278
def cpu_instance_family(cpu_instance_type):
267279
"_".join(cpu_instance_type.split(".")[0:2])

tests/integ/test_horovod.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdi
5252
output_path = "file://%s" % tmpdir
5353
job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
5454
estimator = TensorFlow(
55-
entry_point=os.path.join(horovod_dir, "test_hvd_basic.py"),
55+
entry_point=os.path.join(horovod_dir, "hvd_basic.py"),
5656
role="SageMakerRole",
5757
train_instance_count=2,
5858
train_instance_type="local",
@@ -100,7 +100,7 @@ def extract_files_from_s3(s3_url, tmpdir):
100100
def __create_and_fit_estimator(sagemaker_session, instance_type, tmpdir):
101101
job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
102102
estimator = TensorFlow(
103-
entry_point=os.path.join(horovod_dir, "test_hvd_basic.py"),
103+
entry_point=os.path.join(horovod_dir, "hvd_basic.py"),
104104
role="SageMakerRole",
105105
train_instance_count=2,
106106
train_instance_type=instance_type,

tests/integ/test_inference_pipeline.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,9 @@ def test_inference_pipeline_model_deploy(sagemaker_session, cpu_instance_type):
151151
assert "Could not find model" in str(exception.value)
152152

153153

154-
def test_inference_pipeline_model_deploy_with_update_endpoint(sagemaker_session):
154+
def test_inference_pipeline_model_deploy_with_update_endpoint(
155+
sagemaker_session, cpu_instance_type, alternative_cpu_instance_type
156+
):
155157
sparkml_data_path = os.path.join(DATA_DIR, "sparkml_model")
156158
xgboost_data_path = os.path.join(DATA_DIR, "xgboost_model")
157159
endpoint_name = "test-inference-pipeline-deploy-{}".format(sagemaker_timestamp())
@@ -179,13 +181,13 @@ def test_inference_pipeline_model_deploy_with_update_endpoint(sagemaker_session)
179181
role="SageMakerRole",
180182
sagemaker_session=sagemaker_session,
181183
)
182-
model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
184+
model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
183185
old_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
184186
EndpointName=endpoint_name
185187
)
186188
old_config_name = old_endpoint["EndpointConfigName"]
187189

188-
model.deploy(1, "ml.m4.xlarge", update_endpoint=True, endpoint_name=endpoint_name)
190+
model.deploy(1, cpu_instance_type, update_endpoint=True, endpoint_name=endpoint_name)
189191

190192
# Wait for endpoint to finish updating
191193
max_retry_count = 40 # Endpoint update takes ~7min. 40 retries * 30s sleeps = 20min timeout
@@ -207,7 +209,7 @@ def test_inference_pipeline_model_deploy_with_update_endpoint(sagemaker_session)
207209
)
208210

209211
assert old_config_name != new_config_name
210-
assert new_config["ProductionVariants"][0]["InstanceType"] == "ml.m4.xlarge"
212+
assert new_config["ProductionVariants"][0]["InstanceType"] == cpu_instance_type
211213
assert new_config["ProductionVariants"][0]["InitialInstanceCount"] == 1
212214

213215
model.delete_model()

tests/integ/test_mxnet_train.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,11 @@ def test_deploy_model_with_tags_and_kms(
151151

152152

153153
def test_deploy_model_with_update_endpoint(
154-
mxnet_training_job, sagemaker_session, mxnet_full_version, cpu_instance_type
154+
mxnet_training_job,
155+
sagemaker_session,
156+
mxnet_full_version,
157+
cpu_instance_type,
158+
alternative_cpu_instance_type,
155159
):
156160
endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())
157161

@@ -169,13 +173,13 @@ def test_deploy_model_with_update_endpoint(
169173
sagemaker_session=sagemaker_session,
170174
framework_version=mxnet_full_version,
171175
)
172-
model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
176+
model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
173177
old_endpoint = sagemaker_session.sagemaker_client.describe_endpoint(
174178
EndpointName=endpoint_name
175179
)
176180
old_config_name = old_endpoint["EndpointConfigName"]
177181

178-
model.deploy(1, "ml.m4.xlarge", update_endpoint=True, endpoint_name=endpoint_name)
182+
model.deploy(1, cpu_instance_type, update_endpoint=True, endpoint_name=endpoint_name)
179183

180184
# Wait for endpoint to finish updating
181185
max_retry_count = 40 # Endpoint update takes ~7min. 40 retries * 30s sleeps = 20min timeout
@@ -197,12 +201,16 @@ def test_deploy_model_with_update_endpoint(
197201
)
198202

199203
assert old_config_name != new_config_name
200-
assert new_config["ProductionVariants"][0]["InstanceType"] == "ml.m4.xlarge"
204+
assert new_config["ProductionVariants"][0]["InstanceType"] == cpu_instance_type
201205
assert new_config["ProductionVariants"][0]["InitialInstanceCount"] == 1
202206

203207

204208
def test_deploy_model_with_update_non_existing_endpoint(
205-
mxnet_training_job, sagemaker_session, mxnet_full_version, cpu_instance_type
209+
mxnet_training_job,
210+
sagemaker_session,
211+
mxnet_full_version,
212+
cpu_instance_type,
213+
alternative_cpu_instance_type,
206214
):
207215
endpoint_name = "test-mxnet-deploy-model-{}".format(sagemaker_timestamp())
208216
expected_error_message = (
@@ -224,7 +232,7 @@ def test_deploy_model_with_update_non_existing_endpoint(
224232
sagemaker_session=sagemaker_session,
225233
framework_version=mxnet_full_version,
226234
)
227-
model.deploy(1, "ml.t2.medium", endpoint_name=endpoint_name)
235+
model.deploy(1, alternative_cpu_instance_type, endpoint_name=endpoint_name)
228236
sagemaker_session.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
229237

230238
with pytest.raises(ValueError, message=expected_error_message):

tests/integ/test_tf_keras.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def test_keras(sagemaker_session, cpu_instance_type):
6060

6161
endpoint_name = estimator.latest_training_job.name
6262
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
63-
predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.c4.xlarge")
63+
predictor = estimator.deploy(initial_instance_count=1, instance_type=cpu_instance_type)
6464

6565
data = np.random.randn(32, 32, 3)
6666
predict_response = predictor.predict(data)

tests/integ/test_tf_script_mode.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,9 @@ def test_mnist(sagemaker_session, instance_type):
5757
with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
5858
estimator.fit(inputs=inputs, job_name=unique_name_from_base("test-tf-sm-mnist"))
5959
_assert_s3_files_exist(
60-
estimator.model_dir, ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
60+
estimator.model_dir,
61+
["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
62+
sagemaker_session.boto_region_name,
6163
)
6264
df = estimator.training_job_analytics.dataframe()
6365
assert df.size > 0
@@ -118,7 +120,9 @@ def test_mnist_distributed(sagemaker_session, instance_type):
118120
with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
119121
estimator.fit(inputs=inputs, job_name=unique_name_from_base("test-tf-sm-distributed"))
120122
_assert_s3_files_exist(
121-
estimator.model_dir, ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"]
123+
estimator.model_dir,
124+
["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
125+
sagemaker_session.boto_region_name,
122126
)
123127

124128

@@ -196,9 +200,9 @@ def test_deploy_with_input_handlers(sagemaker_session, instance_type):
196200
assert expected_result == result
197201

198202

199-
def _assert_s3_files_exist(s3_url, files):
203+
def _assert_s3_files_exist(s3_url, files, region):
200204
parsed_url = urlparse(s3_url)
201-
s3 = boto3.client("s3")
205+
s3 = boto3.client("s3", region_name=region)
202206
contents = s3.list_objects_v2(Bucket=parsed_url.netloc, Prefix=parsed_url.path.lstrip("/"))[
203207
"Contents"
204208
]

0 commit comments

Comments
 (0)