@@ -3223,14 +3223,11 @@ def create_model_package_from_containers(
32233223
32243224 def submit (request ):
32253225 if model_package_group_name is not None :
3226- try :
3227- self .sagemaker_client .describe_model_package_group (
3228- ModelPackageGroupName = request ["ModelPackageGroupName" ]
3229- )
3230- except ClientError :
3231- self .sagemaker_client .create_model_package_group (
3226+ _create_resource (
3227+ lambda : self .sagemaker_client .create_model_package_group (
32323228 ModelPackageGroupName = request ["ModelPackageGroupName" ]
32333229 )
3230+ )
32343231 return self .sagemaker_client .create_model_package (** request )
32353232
32363233 return self ._intercept_create_request (
@@ -3918,42 +3915,40 @@ def endpoint_from_model_data(
39183915 name = name or name_from_image (image_uri )
39193916 model_vpc_config = vpc_utils .sanitize (model_vpc_config )
39203917
3921- if _deployment_entity_exists (
3922- lambda : self .sagemaker_client .describe_endpoint (EndpointName = name )
3923- ):
3924- raise ValueError (
3925- 'Endpoint with name "{}" already exists; please pick a different name.' .format (name )
3926- )
3918+ primary_container = container_def (
3919+ image_uri = image_uri ,
3920+ model_data_url = model_s3_location ,
3921+ env = model_environment_vars ,
3922+ )
39273923
3928- if not _deployment_entity_exists (
3929- lambda : self .sagemaker_client .describe_model (ModelName = name )
3930- ):
3931- primary_container = container_def (
3932- image_uri = image_uri ,
3933- model_data_url = model_s3_location ,
3934- env = model_environment_vars ,
3935- )
3936- self .create_model (
3937- name = name , role = role , container_defs = primary_container , vpc_config = model_vpc_config
3938- )
3924+ self .create_model (
3925+ name = name , role = role , container_defs = primary_container , vpc_config = model_vpc_config
3926+ )
39393927
39403928 data_capture_config_dict = None
39413929 if data_capture_config is not None :
39423930 data_capture_config_dict = data_capture_config ._to_request_dict ()
39433931
3944- if not _deployment_entity_exists (
3945- lambda : self .sagemaker_client .describe_endpoint_config (EndpointConfigName = name )
3946- ):
3947- self .create_endpoint_config (
3932+ _create_resource (
3933+ lambda : self .create_endpoint_config (
39483934 name = name ,
39493935 model_name = name ,
39503936 initial_instance_count = initial_instance_count ,
39513937 instance_type = instance_type ,
39523938 accelerator_type = accelerator_type ,
39533939 data_capture_config_dict = data_capture_config_dict ,
39543940 )
3941+ )
3942+
3943+ # to make change backwards compatible
3944+ response = _create_resource (
3945+ lambda : self .create_endpoint (endpoint_name = name , config_name = name , wait = wait )
3946+ )
3947+ if not response :
3948+ raise ValueError (
3949+ 'Endpoint with name "{}" already exists; please pick a different name.' .format (name )
3950+ )
39553951
3956- self .create_endpoint (endpoint_name = name , config_name = name , wait = wait )
39573952 return name
39583953
39593954 def endpoint_from_production_variants (
@@ -5452,34 +5447,54 @@ def _deployment_entity_exists(describe_fn):
54525447 return False
54535448
54545449
5450+ def _create_resource (create_fn ):
5451+ """Call create function and accepts/pass when resource already exists.
5452+
5453+ This is a helper function to use an existing resource if found when creating.
5454+
5455+ Args:
5456+ create_fn: Create resource function.
5457+
5458+ Returns:
5459+ (bool): True if new resource was created, False if resource already exists.
5460+ """
5461+ try :
5462+ create_fn ()
5463+ # create function succeeded, resource does not exist already
5464+ return True
5465+ except ClientError as ce :
5466+ error_code = ce .response ["Error" ]["Code" ]
5467+ error_message = ce .response ["Error" ]["Message" ]
5468+ already_exists_exceptions = ["ValidationException" , "ResourceInUse" ]
5469+ already_exists_msg_patterns = ["Cannot create already existing" , "already exists" ]
5470+ if not (
5471+ error_code in already_exists_exceptions
5472+ and any (p in error_message for p in already_exists_msg_patterns )
5473+ ):
5474+ raise ce
5475+ # no new resource created as resource already exists
5476+ return False
5477+
5478+
54555479def _train_done (sagemaker_client , job_name , last_desc ):
54565480 """Placeholder docstring"""
54575481 in_progress_statuses = ["InProgress" , "Created" ]
54585482
5459- for _ in retries (
5460- max_retry_count = 10 , # 10*30 = 5min
5461- exception_message_prefix = "Waiting for schedule to leave 'Pending' status" ,
5462- seconds_to_sleep = 30 ,
5463- ):
5464- try :
5465- desc = sagemaker_client .describe_training_job (TrainingJobName = job_name )
5466- status = desc ["TrainingJobStatus" ]
5483+ desc = sagemaker_client .describe_training_job (TrainingJobName = job_name )
5484+ status = desc ["TrainingJobStatus" ]
54675485
5468- if secondary_training_status_changed (desc , last_desc ):
5469- print ()
5470- print (secondary_training_status_message (desc , last_desc ), end = "" )
5471- else :
5472- print ("." , end = "" )
5473- sys .stdout .flush ()
5486+ if secondary_training_status_changed (desc , last_desc ):
5487+ print ()
5488+ print (secondary_training_status_message (desc , last_desc ), end = "" )
5489+ else :
5490+ print ("." , end = "" )
5491+ sys .stdout .flush ()
54745492
5475- if status in in_progress_statuses :
5476- return desc , False
5493+ if status in in_progress_statuses :
5494+ return desc , False
54775495
5478- print ()
5479- return desc , True
5480- except botocore .exceptions .ClientError as err :
5481- if err .response ["Error" ]["Code" ] == "AccessDeniedException" :
5482- pass
5496+ print ()
5497+ return desc , True
54835498
54845499
54855500def _processing_job_status (sagemaker_client , job_name ):
@@ -5799,19 +5814,54 @@ def _deploy_done(sagemaker_client, endpoint_name):
57995814
58005815def _wait_until_training_done (callable_fn , desc , poll = 5 ):
58015816 """Placeholder docstring"""
5802- job_desc , finished = callable_fn (desc )
5817+ elapsed_time = 0
5818+ finished = None
5819+ job_desc = desc
58035820 while not finished :
5804- time .sleep (poll )
5805- job_desc , finished = callable_fn (job_desc )
5821+ try :
5822+ elapsed_time += poll
5823+ time .sleep (poll )
5824+ job_desc , finished = callable_fn (job_desc )
5825+ except botocore .exceptions .ClientError as err :
5826+ # For initial 5 mins we accept/pass AccessDeniedException.
5827+ # The reason is to await tag propagation to avoid false AccessDenied claims for an
5828+ # access policy based on resource tags, The caveat here is for true AccessDenied
5829+ # cases the routine will fail after 5 mins
5830+ if err .response ["Error" ]["Code" ] == "AccessDeniedException" and elapsed_time <= 300 :
5831+ LOGGER .warning (
5832+ "Received AccessDeniedException. This could mean the IAM role does not "
5833+ "have the resource permissions, in which case please add resource access "
5834+ "and retry. For cases where the role has tag based resource policy, "
5835+ "continuing to wait for tag propagation.."
5836+ )
5837+ continue
5838+ raise err
58065839 return job_desc
58075840
58085841
58095842def _wait_until (callable_fn , poll = 5 ):
58105843 """Placeholder docstring"""
5811- result = callable_fn ()
5844+ elapsed_time = 0
5845+ result = None
58125846 while result is None :
5813- time .sleep (poll )
5814- result = callable_fn ()
5847+ try :
5848+ elapsed_time += poll
5849+ time .sleep (poll )
5850+ result = callable_fn ()
5851+ except botocore .exceptions .ClientError as err :
5852+ # For initial 5 mins we accept/pass AccessDeniedException.
5853+ # The reason is to await tag propagation to avoid false AccessDenied claims for an
5854+ # access policy based on resource tags, The caveat here is for true AccessDenied
5855+ # cases the routine will fail after 5 mins
5856+ if err .response ["Error" ]["Code" ] == "AccessDeniedException" and elapsed_time <= 300 :
5857+ LOGGER .warning (
5858+ "Received AccessDeniedException. This could mean the IAM role does not "
5859+ "have the resource permissions, in which case please add resource access "
5860+ "and retry. For cases where the role has tag based resource policy, "
5861+ "continuing to wait for tag propagation.."
5862+ )
5863+ continue
5864+ raise err
58155865 return result
58165866
58175867
0 commit comments