1414
1515import base64
1616import os
17+ import requests
1718
19+ import botocore
1820import docker
1921import numpy
2022import pytest
3032from tests .integ .retry import retries
3133from tests .integ .timeout import timeout , timeout_and_delete_endpoint_by_name
3234
33- ALGORITHM_NAME = "sagemaker-multimodel-integ-test"
3435ROLE = "SageMakerRole"
3536PRETRAINED_MODEL_PATH_1 = "customer_a/dummy_model.tar.gz"
3637PRETRAINED_MODEL_PATH_2 = "customer_b/dummy_model.tar.gz"
@@ -47,27 +48,36 @@ def container_image(sagemaker_session):
4748 "sts" , region_name = region , endpoint_url = utils .sts_regional_endpoint (region )
4849 )
4950 account_id = sts_client .get_caller_identity ()["Account" ]
51+ algorithm_name = "sagemaker-multimodel-integ-test-{}" .format (sagemaker_timestamp ())
5052 ecr_image = "{account}.dkr.ecr.{region}.amazonaws.com/{algorithm_name}:latest" .format (
51- account = account_id , region = region , algorithm_name = ALGORITHM_NAME
53+ account = account_id , region = region , algorithm_name = algorithm_name
5254 )
5355
5456 # Build and tag docker image locally
5557 docker_client = docker .from_env ()
5658 image , build_log = docker_client .images .build (
57- path = os .path .join (DATA_DIR , "multimodel" , "container" ), tag = ALGORITHM_NAME , rm = True
59+ path = os .path .join (DATA_DIR , "multimodel" , "container" ), tag = algorithm_name , rm = True
5860 )
5961 image .tag (ecr_image , tag = "latest" )
6062
6163 # Create AWS ECR and push the local docker image to it
62- _create_repository (ecr_client , ALGORITHM_NAME )
64+ _create_repository (ecr_client , algorithm_name )
6365 username , password = _ecr_login (ecr_client )
64- docker_client .images .push (ecr_image , auth_config = {"username" : username , "password" : password })
66+ # Retry docker image push
67+ for _ in retries (3 , "Upload docker image to ECR repo" , seconds_to_sleep = 10 ):
68+ try :
69+ docker_client .images .push (
70+ ecr_image , auth_config = {"username" : username , "password" : password }
71+ )
72+ break
73+ except requests .exceptions .ConnectionError :
74+ # This can happen when we try to create multiple repositories in parallel, so we retry
75+ pass
76+
6577 yield ecr_image
6678
6779 # Delete repository after the multi model integration tests complete
68- repo = ecr_client .describe_repositories (repositoryNames = [ALGORITHM_NAME ])
69- if "repositories" in repo :
70- ecr_client .delete_repository (repositoryName = ALGORITHM_NAME , force = True )
80+ _delete_repository (ecr_client , algorithm_name )
7181
7282
7383def _create_repository (ecr_client , repository_name ):
@@ -87,6 +97,18 @@ def _create_repository(ecr_client, repository_name):
8797 raise
8898
8999
def _delete_repository(ecr_client, repository_name):
    """Delete an Amazon ECR repository, ignoring the case where it is already gone.

    Called after the integration tests complete to remove the repository that
    was created during setup.

    Args:
        ecr_client: boto3 ECR client used to issue the delete call.
        repository_name (str): Name of the repository to delete.
    """
    try:
        # ``force=True`` is passed so the delete is not refused because the
        # repository still holds the image pushed during setup.
        ecr_client.delete_repository(repositoryName=repository_name, force=True)
    except ecr_client.exceptions.RepositoryNotFoundException:
        # Modeled service errors are generated at runtime and exposed on
        # ``client.exceptions``; they are not attributes of
        # ``botocore.errorfactory``. A missing repository means there is
        # nothing left to clean up.
        pass
110+
111+
90112def _ecr_login (ecr_client ):
91113 """ Get a login credentials for an ecr client.
92114 """
0 commit comments