@@ -950,6 +950,11 @@ def train( # noqa: C901
950950 }
951951 Returns:
952952 str: ARN of the training job, if it is created.
953+
954+ Raises:
955+ - botocore.exceptions.ClientError: If SageMaker throws an exception while creating
956+ the training job.
957+ - ValueError: If both image_uri and algorithm are provided, or if neither is provided.
953958 """
954959 tags = _append_project_tags (format_tags (tags ))
955960 tags = self ._append_sagemaker_config_tags (
@@ -1033,9 +1038,19 @@ def train( # noqa: C901
10331038 )
10341039
10351040 def submit (request ):
1036- logger .info ("Creating training-job with name: %s" , job_name )
1037- logger .debug ("train request: %s" , json .dumps (request , indent = 4 ))
1038- self .sagemaker_client .create_training_job (** request )
1041+ try :
1042+ logger .info ("Creating training-job with name: %s" , job_name )
1043+ logger .debug ("train request: %s" , json .dumps (request , indent = 4 ))
1044+ self .sagemaker_client .create_training_job (** request )
1045+ except Exception as e :
1046+ troubleshooting = (
1047+ "https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html"
1048+ "#sagemaker-python-sdk-troubleshooting-create-training-job"
1049+ )
1050+ logger .error (
1051+ "Please check the troubleshooting guide for common errors: %s" , troubleshooting
1052+ )
1053+ raise e
10391054
10401055 self ._intercept_create_request (train_request , submit , self .train .__name__ )
10411056
@@ -1342,6 +1357,15 @@ def update_training_job(
13421357 remote_debug_config = {
13431358 "EnableRemoteDebug": True,
13441359 }
1360+
1361+ Returns:
1362+ None: The response of the UpdateTrainingJob call is not returned to the caller.
1363+
1364+ Raises:
1365+ - botocore.exceptions.ClientError: If SageMaker throws an error while updating the
1366+ training job.
1367+ - botocore.exceptions.ParamValidationError: If any request parameters are in an invalid
1368+ format.
13451369 """
13461370 # No injections from sagemaker_config because the UpdateTrainingJob API's resource_config
13471371 # object accepts fewer parameters than the CreateTrainingJob API, and none that the
@@ -1356,9 +1380,28 @@ def update_training_job(
13561380 resource_config = resource_config ,
13571381 remote_debug_config = remote_debug_config ,
13581382 )
1359- logger .info ("Updating training job with name %s" , job_name )
1360- logger .debug ("Update request: %s" , json .dumps (update_training_job_request , indent = 4 ))
1361- self .sagemaker_client .update_training_job (** update_training_job_request )
1383+ try :
1384+ logger .info ("Updating training job with name %s" , job_name )
1385+ logger .debug ("Update request: %s" , json .dumps (update_training_job_request , indent = 4 ))
1386+ self .sagemaker_client .update_training_job (** update_training_job_request )
1387+ except botocore .exceptions .ParamValidationError as e :
1388+ troubleshooting = (
1389+ "Incorrect request parameter was provided. Check the API documentation: "
1390+ "https://docs.aws.amazon.com/sagemaker/latest/APIReference/"
1391+ "API_UpdateTrainingJob.html#API_UpdateTrainingJob_RequestParameters"
1392+ )
1393+ logger .error ("%s" , troubleshooting )
1394+ raise e
1395+ except botocore .exceptions .ClientError as e :
1396+ troubleshooting = (
1397+ "https://docs.aws.amazon.com/sagemaker/latest/dg/"
1398+ "sagemaker-python-sdk-troubleshooting.html"
1399+ "#sagemaker-python-sdk-troubleshooting-update-training-job"
1400+ )
1401+ logger .error (
1402+ "Please check the troubleshooting guide for common errors: %s" , troubleshooting
1403+ )
1404+ raise e
13621405
13631406 def _get_update_training_job_request (
13641407 self ,
@@ -1461,6 +1504,10 @@ def process(
14611504 * If both `ExperimentName` and `TrialName` are not supplied the trial component
14621505 will be unassociated.
14631506 * `TrialComponentDisplayName` is used for display in Studio.
1507+
1508+ Raises:
1509+ - botocore.exceptions.ClientError: If SageMaker throws an error while creating
1510+ the processing job.
14641511 """
14651512 tags = _append_project_tags (format_tags (tags ))
14661513 tags = self ._append_sagemaker_config_tags (
@@ -1524,9 +1571,20 @@ def process(
15241571 )
15251572
15261573 def submit (request ):
1527- logger .info ("Creating processing-job with name %s" , job_name )
1528- logger .debug ("process request: %s" , json .dumps (request , indent = 4 ))
1529- self .sagemaker_client .create_processing_job (** request )
1574+ try :
1575+ logger .info ("Creating processing-job with name %s" , job_name )
1576+ logger .debug ("process request: %s" , json .dumps (request , indent = 4 ))
1577+ self .sagemaker_client .create_processing_job (** request )
1578+ except Exception as e :
1579+ troubleshooting = (
1580+ "https://docs.aws.amazon.com/sagemaker/latest/dg/"
1581+ "sagemaker-python-sdk-troubleshooting.html"
1582+ "#sagemaker-python-sdk-troubleshooting-create-processing-job"
1583+ )
1584+ logger .error (
1585+ "Please check the troubleshooting guide for common errors: %s" , troubleshooting
1586+ )
1587+ raise e
15301588
15311589 self ._intercept_create_request (process_request , submit , self .process .__name__ )
15321590
@@ -4573,6 +4631,10 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live
45734631
45744632 Returns:
45754633 str: Name of the Amazon SageMaker ``Endpoint`` created.
4634+
4635+ Raises:
4636+ botocore.exceptions.ClientError: If SageMaker throws an exception while creating
4637+ the endpoint.
45764638 """
45774639 logger .info ("Creating endpoint with name %s" , endpoint_name )
45784640
@@ -4581,16 +4643,26 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live
45814643 tags = self ._append_sagemaker_config_tags (
45824644 tags , "{}.{}.{}" .format (SAGEMAKER , ENDPOINT , TAGS )
45834645 )
4584-
4585- res = self .sagemaker_client .create_endpoint (
4586- EndpointName = endpoint_name , EndpointConfigName = config_name , Tags = tags
4587- )
4588- if res :
4589- self .endpoint_arn = res ["EndpointArn" ]
4590-
4591- if wait :
4592- self .wait_for_endpoint (endpoint_name , live_logging = live_logging )
4593- return endpoint_name
4646+ try :
4647+ res = self .sagemaker_client .create_endpoint (
4648+ EndpointName = endpoint_name , EndpointConfigName = config_name , Tags = tags
4649+ )
4650+ if res :
4651+ self .endpoint_arn = res ["EndpointArn" ]
4652+
4653+ if wait :
4654+ self .wait_for_endpoint (endpoint_name , live_logging = live_logging )
4655+ return endpoint_name
4656+ except Exception as e :
4657+ troubleshooting = (
4658+ "https://docs.aws.amazon.com/sagemaker/latest/dg/"
4659+ "sagemaker-python-sdk-troubleshooting.html"
4660+ "#sagemaker-python-sdk-troubleshooting-create-endpoint"
4661+ )
4662+ logger .error (
4663+ "Please check the troubleshooting guide for common errors: %s" , troubleshooting
4664+ )
4665+ raise e
45944666
45954667 def endpoint_in_service_or_not (self , endpoint_name : str ):
45964668 """Check whether an Amazon SageMaker ``Endpoint`` is in IN_SERVICE status.
@@ -4635,7 +4707,9 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True):
46354707 str: Name of the Amazon SageMaker ``Endpoint`` being updated.
46364708
46374709 Raises:
4638- ValueError: if the endpoint does not already exist
4710+ - ValueError: if the endpoint does not already exist
4711+ - botocore.exceptions.ClientError: If SageMaker throws an error while
4712+ creating endpoint config, describing endpoint or updating endpoint
46394713 """
46404714 if not _deployment_entity_exists (
46414715 lambda : self .sagemaker_client .describe_endpoint (EndpointName = endpoint_name )
@@ -4645,15 +4719,27 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True):
46454719 "existing endpoint name" .format (endpoint_name )
46464720 )
46474721
4648- res = self .sagemaker_client .update_endpoint (
4649- EndpointName = endpoint_name , EndpointConfigName = endpoint_config_name
4650- )
4651- if res :
4652- self .endpoint_arn = res ["EndpointArn" ]
4722+ try :
46534723
4654- if wait :
4655- self .wait_for_endpoint (endpoint_name )
4656- return endpoint_name
4724+ res = self .sagemaker_client .update_endpoint (
4725+ EndpointName = endpoint_name , EndpointConfigName = endpoint_config_name
4726+ )
4727+ if res :
4728+ self .endpoint_arn = res ["EndpointArn" ]
4729+
4730+ if wait :
4731+ self .wait_for_endpoint (endpoint_name )
4732+ return endpoint_name
4733+ except Exception as e :
4734+ troubleshooting = (
4735+ "https://docs.aws.amazon.com/sagemaker/latest/dg/"
4736+ "sagemaker-python-sdk-troubleshooting.html"
4737+ "#sagemaker-python-sdk-troubleshooting-update-endpoint"
4738+ )
4739+ logger .error (
4740+ "Please check the troubleshooting guide for common errors: %s" , troubleshooting
4741+ )
4742+ raise e
46574743
46584744 def is_inference_component_based_endpoint (self , endpoint_name ):
46594745 """Returns 'True' if endpoint is inference-component-based, 'False' otherwise.
@@ -4934,7 +5020,7 @@ def update_inference_component(
49345020 return inference_component_name
49355021
49365022 def delete_inference_component (self , inference_component_name : str , wait : bool = False ):
4937- """Deletes a InferenceComponent.
5023+ """Deletes an InferenceComponent.
49385024
49395025 Args:
49405026 inference_component_name (str): Name of the Amazon SageMaker ``InferenceComponent``
@@ -8502,8 +8588,19 @@ def _check_job_status(job, desc, status_key_name):
85028588 elif status != "Completed" :
85038589 reason = desc .get ("FailureReason" , "(No reason provided)" )
85048590 job_type = status_key_name .replace ("JobStatus" , " job" )
8505- message = "Error for {job_type} {job_name}: {status}. Reason: {reason}" .format (
8506- job_type = job_type , job_name = job , status = status , reason = reason
8591+ troubleshooting = (
8592+ "https://docs.aws.amazon.com/sagemaker/latest/dg/"
8593+ "sagemaker-python-sdk-troubleshooting.html"
8594+ )
8595+ message = (
8596+ "Error for {job_type} {job_name}: {status}. Reason: {reason}. "
8597+ "Check troubleshooting guide for common errors: {troubleshooting}"
8598+ ).format (
8599+ job_type = job_type ,
8600+ job_name = job ,
8601+ status = status ,
8602+ reason = reason ,
8603+ troubleshooting = troubleshooting ,
85078604 )
85088605 if "CapacityError" in str (reason ):
85098606 raise exceptions .CapacityError (
0 commit comments