Skip to content

Commit ad50fbc

Browse files
authored
fixes for test flakiness post 0.0.4 release (#97)
Description of changes: - tags sometimes become available on a resource only after a slight delay following its creation - the adopted_endpoint test was missing deletion of the CR, which caused false-positive error messages to clutter the logs - stopping a job sometimes takes longer than the timeout - minor linting and clean up Testing: PR build By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
1 parent 7d2f7a0 commit ad50fbc

12 files changed

+123
-96
lines changed

test/e2e/__init__.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,3 +313,35 @@ def assert_training_status_in_sync(training_job_name, reference, expected_status
313313
== wait_resource_training_status(reference, expected_status)
314314
== expected_status
315315
)
316+
317+
318+
def get_sagemaker_endpoint(endpoint_name: str):
319+
try:
320+
return sagemaker_client().describe_endpoint(EndpointName=endpoint_name)
321+
except botocore.exceptions.ClientError as error:
322+
logging.error(
323+
f"SageMaker could not find a endpoint with the name {endpoint_name}. Error {error}"
324+
)
325+
return None
326+
327+
328+
def get_sagemaker_model(model_name: str):
329+
try:
330+
return sagemaker_client().describe_model(ModelName=model_name)
331+
except botocore.exceptions.ClientError as error:
332+
logging.error(
333+
f"SageMaker could not find a model with the name {model_name}. Error {error}"
334+
)
335+
return None
336+
337+
338+
def get_sagemaker_endpoint_config(config_name: str):
339+
try:
340+
return sagemaker_client().describe_endpoint_config(
341+
EndpointConfigName=config_name
342+
)
343+
except botocore.exceptions.ClientError as error:
344+
logging.error(
345+
f"SageMaker could not find an endpoint config with the name {config_name}. Error {error}"
346+
)
347+
return None

test/e2e/common/config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,5 +33,7 @@
3333
DELETE_WAIT_PERIOD = 4
3434
DELETE_WAIT_LENGTH = 30
3535

36-
JOB_DELETE_WAIT_PERIODS = 8
36+
JOB_DELETE_WAIT_PERIODS = 12
3737
JOB_DELETE_WAIT_LENGTH = 30
38+
39+
TAG_DELAY_SLEEP = 20

test/e2e/service_bootstrap.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import logging
1919
import time
2020
import subprocess
21+
import random
2122

2223
from acktest import resources
2324
from acktest.aws.identity import get_region, get_account_id
@@ -48,10 +49,15 @@ def create_execution_role() -> str:
4849
Description="SageMaker execution role for ACK integration and canary tests",
4950
)
5051

52+
# random sleep to prevent throttling
53+
time.sleep(random.randrange(1, 3))
5154
iam.attach_role_policy(
5255
RoleName=role_name,
5356
PolicyArn="arn:aws:iam::aws:policy/AmazonSageMakerFullAccess",
5457
)
58+
59+
# random sleep to prevent throttling
60+
time.sleep(random.randrange(1, 3))
5561
iam.attach_role_policy(
5662
RoleName=role_name, PolicyArn="arn:aws:iam::aws:policy/AmazonS3FullAccess"
5763
)

test/e2e/tests/test_adopt_endpoint.py

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,9 @@
1313
"""Integration tests for the SageMaker Endpoint API.
1414
"""
1515

16-
import boto3
1716
import pytest
18-
import logging
19-
import time
2017
from typing import Dict
2118

22-
from acktest.aws import s3
2319
from acktest.resources import random_suffix_name
2420
from acktest.k8s import resource as k8s
2521

@@ -31,6 +27,9 @@
3127
wait_sagemaker_endpoint_status,
3228
assert_endpoint_status_in_sync,
3329
sagemaker_client,
30+
get_sagemaker_endpoint,
31+
get_sagemaker_endpoint_config,
32+
get_sagemaker_model,
3433
)
3534
from e2e.replacement_values import REPLACEMENT_VALUES
3635
from e2e.common import config as cfg
@@ -112,10 +111,16 @@ def sdk_endpoint(name_suffix):
112111
endpoint_input,
113112
endpoint_response,
114113
)
115-
wait_sagemaker_endpoint_status(endpoint_name, cfg.ENDPOINT_STATUS_INSERVICE)
116-
sagemaker_client().delete_endpoint(EndpointName=endpoint_name)
117-
sagemaker_client().delete_endpoint_config(EndpointConfigName=endpoint_config_name)
118-
sagemaker_client().delete_model(ModelName=model_name)
114+
115+
if get_sagemaker_endpoint(endpoint_name) is not None:
116+
wait_sagemaker_endpoint_status(endpoint_name, cfg.ENDPOINT_STATUS_INSERVICE)
117+
sagemaker_client().delete_endpoint(EndpointName=endpoint_name)
118+
if get_sagemaker_endpoint_config(endpoint_config_name) is not None:
119+
sagemaker_client().delete_endpoint_config(
120+
EndpointConfigName=endpoint_config_name
121+
)
122+
if get_sagemaker_model(model_name) is not None:
123+
sagemaker_client().delete_model(ModelName=model_name)
119124

120125

121126
@pytest.fixture(scope="module")
@@ -262,3 +267,9 @@ def test_smoke(self, sdk_endpoint, adopted_endpoint):
262267
endpoint_name, endpoint_reference, cfg.ENDPOINT_STATUS_INSERVICE,
263268
)
264269
assert k8s.wait_on_condition(endpoint_reference, "ACK.ResourceSynced", "True")
270+
271+
for cr in (model_reference, config_reference, endpoint_reference):
272+
_, deleted = k8s.delete_custom_resource(
273+
cr, cfg.DELETE_WAIT_PERIOD, cfg.DELETE_WAIT_LENGTH
274+
)
275+
assert deleted

test/e2e/tests/test_adopt_model_package.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -242,16 +242,8 @@ def test_smoke(self, sdk_model_package, adopted_model_package):
242242
model_package_reference, "ACK.ResourceSynced", "True"
243243
)
244244

245-
_, deleted = k8s.delete_custom_resource(
246-
model_package_reference,
247-
cfg.JOB_DELETE_WAIT_PERIODS,
248-
cfg.JOB_DELETE_WAIT_LENGTH,
249-
)
250-
assert deleted is True
251-
252-
_, deleted = k8s.delete_custom_resource(
253-
model_package_group_reference,
254-
cfg.JOB_DELETE_WAIT_PERIODS,
255-
cfg.JOB_DELETE_WAIT_LENGTH,
256-
)
257-
assert deleted is True
245+
for cr in (model_package_reference, model_package_group_reference):
246+
_, deleted = k8s.delete_custom_resource(
247+
cr, cfg.JOB_DELETE_WAIT_PERIODS, cfg.JOB_DELETE_WAIT_LENGTH
248+
)
249+
assert deleted

test/e2e/tests/test_data_quality_job_definition.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,14 @@
1616
import pytest
1717
import logging
1818
import botocore
19+
import time
1920

2021
from e2e import service_marker, assert_tags_in_sync
2122
from e2e.common.fixtures import (
2223
xgboost_churn_data_quality_job_definition,
2324
xgboost_churn_endpoint,
2425
)
26+
from e2e.common import config as cfg
2527
from acktest.k8s import resource as k8s
2628

2729
# Access variable so it is loaded as a fixture
@@ -56,6 +58,8 @@ def test_smoke(self, sagemaker_client, xgboost_churn_data_quality_job_definition
5658
job_definition_arn = job_definition_desc["JobDefinitionArn"]
5759
assert k8s.get_resource_arn(resource) == job_definition_arn
5860

61+
# fixed sleep before we check for tags to reduce test flakiness
62+
time.sleep(cfg.TAG_DELAY_SLEEP)
5963
resource_tags = resource["spec"].get("tags", None)
6064
assert_tags_in_sync(job_definition_arn, resource_tags)
6165
# Delete the k8s resource.

test/e2e/tests/test_endpoint.py

Lines changed: 34 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,9 @@
1313
"""Integration tests for the SageMaker Endpoint API.
1414
"""
1515

16-
import botocore
17-
from botocore import endpoint
1816
import pytest
1917
import logging
20-
import time
18+
2119
from typing import Dict
2220

2321
from acktest.aws import s3
@@ -29,13 +27,17 @@
2927
create_sagemaker_resource,
3028
assert_endpoint_status_in_sync,
3129
assert_tags_in_sync,
30+
get_sagemaker_endpoint,
3231
)
3332
from e2e.replacement_values import REPLACEMENT_VALUES
3433
from e2e.common import config as cfg
3534

3635
FAIL_UPDATE_ERROR_MESSAGE = "EndpointUpdateError: unable to update endpoint. check FailureReason. latest EndpointConfigName is "
3736
# annotation key for the last endpoint config name used for update
38-
LAST_ENDPOINTCONFIG_UPDATE_ANNOTATION = "sagemaker.services.k8s.aws/last-endpoint-config-for-update"
37+
LAST_ENDPOINTCONFIG_UPDATE_ANNOTATION = (
38+
"sagemaker.services.k8s.aws/last-endpoint-config-for-update"
39+
)
40+
3941

4042
@pytest.fixture(scope="module")
4143
def name_suffix():
@@ -212,35 +214,17 @@ def faulty_config(name_suffix, single_container_model):
212214
@service_marker
213215
@pytest.mark.canary
214216
class TestEndpoint:
215-
def _get_resource_endpoint_arn(self, resource: Dict):
216-
assert (
217-
"ackResourceMetadata" in resource["status"]
218-
and "arn" in resource["status"]["ackResourceMetadata"]
219-
)
220-
return resource["status"]["ackResourceMetadata"]["arn"]
221-
222-
def _describe_sagemaker_endpoint(self, sagemaker_client, endpoint_name: str):
223-
try:
224-
return sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
225-
except botocore.exceptions.ClientError as error:
226-
logging.error(
227-
f"SageMaker could not find a endpoint with the name {endpoint_name}. Error {error}"
228-
)
229-
return None
230-
231-
def create_endpoint_test(self, sagemaker_client, xgboost_endpoint):
217+
def create_endpoint_test(self, xgboost_endpoint):
232218
(reference, resource, _) = xgboost_endpoint
233219
assert k8s.get_resource_exists(reference)
234220

235221
# endpoint has correct arn and status
236222
endpoint_name = resource["spec"].get("endpointName", None)
237223
assert endpoint_name is not None
238224

239-
endpoint_desc = self._describe_sagemaker_endpoint(
240-
sagemaker_client, endpoint_name
241-
)
225+
endpoint_desc = get_sagemaker_endpoint(endpoint_name)
242226
endpoint_arn = endpoint_desc["EndpointArn"]
243-
assert self._get_resource_endpoint_arn(resource) == endpoint_arn
227+
assert k8s.get_resource_arn(resource) == endpoint_arn
244228

245229
# endpoint transitions Creating -> InService state
246230
assert_endpoint_status_in_sync(
@@ -257,7 +241,7 @@ def create_endpoint_test(self, sagemaker_client, xgboost_endpoint):
257241
assert_tags_in_sync(endpoint_arn, resource_tags)
258242

259243
def update_endpoint_failed_test(
260-
self, sagemaker_client, single_variant_config, faulty_config, xgboost_endpoint
244+
self, single_variant_config, faulty_config, xgboost_endpoint
261245
):
262246
(endpoint_reference, _, endpoint_spec) = xgboost_endpoint
263247
(_, faulty_config_resource) = faulty_config
@@ -284,25 +268,28 @@ def update_endpoint_failed_test(
284268
)
285269

286270
assert k8s.wait_on_condition(endpoint_reference, "ACK.ResourceSynced", "False")
287-
271+
288272
(_, old_config_resource) = single_variant_config
289-
current_config_name = old_config_resource["spec"].get("endpointConfigName", None)
273+
current_config_name = old_config_resource["spec"].get(
274+
"endpointConfigName", None
275+
)
290276
assert k8s.assert_condition_state_message(
291-
endpoint_reference, "ACK.Terminal", "True", FAIL_UPDATE_ERROR_MESSAGE+current_config_name,
277+
endpoint_reference,
278+
"ACK.Terminal",
279+
"True",
280+
FAIL_UPDATE_ERROR_MESSAGE + current_config_name,
292281
)
293282

294283
endpoint_resource = k8s.get_resource(endpoint_reference)
295284
assert endpoint_resource["status"].get("failureReason", None) is not None
296285

297-
def update_endpoint_successful_test(
298-
self, sagemaker_client, multi_variant_config, xgboost_endpoint
299-
):
286+
def update_endpoint_successful_test(self, multi_variant_config, xgboost_endpoint):
300287
(endpoint_reference, endpoint_resource, endpoint_spec) = xgboost_endpoint
301288

302289
endpoint_name = endpoint_resource["spec"].get("endpointName", None)
303-
production_variants = self._describe_sagemaker_endpoint(
304-
sagemaker_client, endpoint_name
305-
)["ProductionVariants"]
290+
production_variants = get_sagemaker_endpoint(endpoint_name)[
291+
"ProductionVariants"
292+
]
306293
old_variant_instance_count = production_variants[0]["CurrentInstanceCount"]
307294
old_variant_name = production_variants[0]["VariantName"]
308295

@@ -338,9 +325,9 @@ def update_endpoint_successful_test(
338325
assert endpoint_resource["status"].get("failureReason", None) is None
339326

340327
# RetainAllVariantProperties - variant properties were retained + is a multi-variant endpoint
341-
new_production_variants = self._describe_sagemaker_endpoint(
342-
sagemaker_client, endpoint_name
343-
)["ProductionVariants"]
328+
new_production_variants = get_sagemaker_endpoint(endpoint_name)[
329+
"ProductionVariants"
330+
]
344331
assert len(new_production_variants) > 1
345332
new_variant_instance_count = None
346333
for variant in new_production_variants:
@@ -349,14 +336,16 @@ def update_endpoint_successful_test(
349336

350337
assert new_variant_instance_count == old_variant_instance_count
351338

352-
def delete_endpoint_test(self, sagemaker_client, xgboost_endpoint):
339+
def delete_endpoint_test(self, xgboost_endpoint):
353340
(reference, resource, _) = xgboost_endpoint
354341
endpoint_name = resource["spec"].get("endpointName", None)
355342

356-
_, deleted = k8s.delete_custom_resource(reference, cfg.DELETE_WAIT_PERIOD, cfg.DELETE_WAIT_LENGTH)
343+
_, deleted = k8s.delete_custom_resource(
344+
reference, cfg.DELETE_WAIT_PERIOD, cfg.DELETE_WAIT_LENGTH
345+
)
357346
assert deleted
358347

359-
assert self._describe_sagemaker_endpoint(sagemaker_client, endpoint_name) is None
348+
assert get_sagemaker_endpoint(endpoint_name) is None
360349

361350
def test_driver(
362351
self,
@@ -366,13 +355,11 @@ def test_driver(
366355
multi_variant_config,
367356
xgboost_endpoint,
368357
):
369-
self.create_endpoint_test(sagemaker_client, xgboost_endpoint)
358+
self.create_endpoint_test(xgboost_endpoint)
370359
self.update_endpoint_failed_test(
371-
sagemaker_client, single_variant_config, faulty_config, xgboost_endpoint
360+
single_variant_config, faulty_config, xgboost_endpoint
372361
)
373362
# Note: the test has been intentionally ordered to run a successful update after a failed update
374363
# check that controller updates the endpoint, removes the terminal condition and clears the failure reason
375-
self.update_endpoint_successful_test(
376-
sagemaker_client, multi_variant_config, xgboost_endpoint
377-
)
378-
self.delete_endpoint_test(sagemaker_client, xgboost_endpoint)
364+
self.update_endpoint_successful_test(multi_variant_config, xgboost_endpoint)
365+
self.delete_endpoint_test(xgboost_endpoint)

test/e2e/tests/test_endpoint_config.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,19 @@
1313
"""Integration tests for the SageMaker EndpointConfig API.
1414
"""
1515

16-
import botocore
1716
import pytest
1817
import logging
1918
from typing import Dict
19+
import time
2020

2121
from acktest.resources import random_suffix_name
2222
from acktest.k8s import resource as k8s
2323

2424
from e2e import (
2525
service_marker,
2626
create_sagemaker_resource,
27-
sagemaker_client,
2827
assert_tags_in_sync,
28+
get_sagemaker_endpoint_config,
2929
)
3030
from e2e.replacement_values import REPLACEMENT_VALUES
3131
from e2e.common import config as cfg
@@ -70,18 +70,6 @@ def single_variant_config():
7070
assert deleted
7171

7272

73-
def get_sagemaker_endpoint_config(config_name: str):
74-
try:
75-
return sagemaker_client().describe_endpoint_config(
76-
EndpointConfigName=config_name
77-
)
78-
except botocore.exceptions.ClientError as error:
79-
logging.error(
80-
f"SageMaker could not find a config with the name {config_name}. Error {error}"
81-
)
82-
return None
83-
84-
8573
@service_marker
8674
@pytest.mark.canary
8775
class TestEndpointConfig:
@@ -94,6 +82,8 @@ def test_create_endpoint_config(self, single_variant_config):
9482
endpoint_arn = endpoint_config_desc["EndpointConfigArn"]
9583
assert k8s.get_resource_arn(resource) == endpoint_arn
9684

85+
# fixed sleep before we check for tags to reduce test flakiness
86+
time.sleep(cfg.TAG_DELAY_SLEEP)
9787
resource_tags = resource["spec"].get("tags", None)
9888
assert_tags_in_sync(endpoint_arn, resource_tags)
9989
# Delete the k8s resource.

0 commit comments

Comments
 (0)