From fb7b64d01f532b57cc0496c590bff8d55113ab34 Mon Sep 17 00:00:00 2001 From: adishaa Date: Tue, 20 Aug 2024 15:14:16 -0700 Subject: [PATCH] change: Add troubleshooting links to exceptions --- .pylintrc | 2 +- src/sagemaker/algorithm.py | 14 +++ src/sagemaker/base_predictor.py | 2 + src/sagemaker/estimator.py | 32 ++++++- src/sagemaker/local/entities.py | 12 ++- src/sagemaker/session.py | 159 +++++++++++++++++++++++++------- 6 files changed, 181 insertions(+), 40 deletions(-) diff --git a/.pylintrc b/.pylintrc index 11e2ababa9..dd7c59831e 100644 --- a/.pylintrc +++ b/.pylintrc @@ -384,7 +384,7 @@ max-returns=6 max-branches=12 # Maximum number of statements in function / method body -max-statements=100 +max-statements=105 # Maximum number of parents for a class (see R0901). max-parents=7 diff --git a/src/sagemaker/algorithm.py b/src/sagemaker/algorithm.py index 51c93c4986..f3fd2c954e 100644 --- a/src/sagemaker/algorithm.py +++ b/src/sagemaker/algorithm.py @@ -157,6 +157,20 @@ def __init__( available (default: ``None``). **kwargs: Additional kwargs. This is unused. It's only added for AlgorithmEstimator to ignore the irrelevant arguments. + + Raises: + ValueError: + - If an AWS IAM Role is not provided. + - Bad value for instance type. 
+ RuntimeError: + - When setting up custom VPC, subnets and security_group_ids are not both provided + - If instance_count > 1 (distributed training) with instance type local_gpu + - If LocalSession is not used with instance type local or local_gpu + - file:// output path used outside of local mode + botocore.exceptions.ClientError: + - algorithm arn is incorrect + - insufficient permission to access/describe algorithm + - algorithm is in a different region """ self.algorithm_arn = algorithm_arn super(AlgorithmEstimator, self).__init__( diff --git a/src/sagemaker/base_predictor.py b/src/sagemaker/base_predictor.py index 1a7eea9cd7..a9b2cb021d 100644 --- a/src/sagemaker/base_predictor.py +++ b/src/sagemaker/base_predictor.py @@ -430,6 +430,8 @@ def update_endpoint( - If ``initial_instance_count``, ``instance_type``, or ``accelerator_type`` is specified and either ``model_name`` is ``None`` or there are multiple models associated with the endpoint. + botocore.exceptions.ClientError: If SageMaker throws an error while creating + endpoint config, describing endpoint or updating endpoint """ production_variants = None current_model_names = self._get_model_names() diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 66b746b1b0..6f02fde8e8 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -590,25 +590,36 @@ def __init__( self.dependencies = dependencies or [] self.uploaded_code: Optional[UploadedCode] = None - # Check that the user properly sets both subnet and secutiry_groupe_ids + # Check that the user properly sets both subnet and security_group_ids if ( subnets is not None and security_group_ids is None or security_group_ids is not None and subnets is None ): + troubleshooting = ( + "Refer to this documentation on using custom VPC: " + "https://sagemaker.readthedocs.io/en/v2.24.0/overview.html" + "#secure-training-and-inference-with-vpc" + ) + logger.error("Check troubleshooting guide for common errors: %s", 
troubleshooting) + raise RuntimeError( "When setting up custom VPC, both subnets and security_group_ids must be set" ) if self.instance_type in ("local", "local_gpu"): if self.instance_type == "local_gpu" and self.instance_count > 1: - raise RuntimeError("Distributed Training in Local GPU is not supported") + raise RuntimeError( + "Distributed Training in Local GPU is not supported." + " Set instance_count to 1." + ) self.sagemaker_session = sagemaker_session or LocalSession() if not isinstance(self.sagemaker_session, sagemaker.local.LocalSession): raise RuntimeError( "instance_type local or local_gpu is only supported with an" - "instance of LocalSession" + "instance of LocalSession. More details on local mode: " + "https://sagemaker.readthedocs.io/en/stable/overview.html#local-mode" ) else: self.sagemaker_session = sagemaker_session or Session() @@ -631,7 +642,11 @@ def __init__( and not is_pipeline_variable(output_path) and output_path.startswith("file://") ): - raise RuntimeError("file:// output paths are only supported in Local Mode") + raise RuntimeError( + "The 'file://' output paths are only supported when using Local Mode. " + "To resolve this issue, ensure you're running in Local Mode with a LocalSession, " + "or use an 's3://' output path for jobs running on SageMaker instances." + ) self.output_path = output_path self.latest_training_job = None self.jobs = [] @@ -646,7 +661,12 @@ def __init__( # Now we marked that as Optional because we can fetch it from SageMakerConfig # Because of marking that parameter as optional, we should validate if it is None, even # after fetching the config. - raise ValueError("An AWS IAM role is required to create an estimator.") + raise ValueError( + "An AWS IAM role is required to create an estimator. " + "Please provide a valid `role` argument with the ARN of an IAM role" + " that has the necessary SageMaker permissions." 
+ ) + self.output_kms_key = resolve_value_from_config( output_kms_key, TRAINING_JOB_KMS_KEY_ID_PATH, sagemaker_session=self.sagemaker_session ) @@ -1855,6 +1875,8 @@ def model_data(self): if compression_type not in {"GZIP", "NONE"}: raise ValueError( f'Unrecognized training job output data compression type "{compression_type}"' + '. Please specify either "GZIP" or "NONE" as valid options for ' + "the compression type." ) # model data is in uncompressed form NOTE SageMaker Hosting mandates presence of # trailing forward slash in S3 model data URI, so append one if necessary. diff --git a/src/sagemaker/local/entities.py b/src/sagemaker/local/entities.py index 2ce37f68bd..a21a375f54 100644 --- a/src/sagemaker/local/entities.py +++ b/src/sagemaker/local/entities.py @@ -213,6 +213,10 @@ def start(self, input_data_config, output_data_config, hyperparameters, environm hyperparameters (dict): The HyperParameters for the training job. environment (dict): The collection of environment variables passed to the job. job_name (str): Name of the local training job being run. + + Raises: + ValueError: If the input data configuration is not valid. + RuntimeError: If the data distribution type is not supported. """ for channel in input_data_config: if channel["DataSource"] and "S3DataSource" in channel["DataSource"]: @@ -233,10 +237,12 @@ def start(self, input_data_config, output_data_config, hyperparameters, environm # use a single Data URI - this makes handling S3 and File Data easier down the stack channel["DataUri"] = data_uri - if data_distribution and data_distribution != "FullyReplicated": + supported_distributions = ["FullyReplicated"] + if data_distribution and data_distribution not in supported_distributions: raise RuntimeError( - "DataDistribution: %s is not currently supported in Local Mode" - % data_distribution + "Invalid DataDistribution: '{}'. 
Local mode currently supports: {}.".format( + data_distribution, ", ".join(supported_distributions) + ) ) self.start_time = datetime.datetime.now() diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 7035d9547d..b10a809259 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -950,6 +950,11 @@ def train( # noqa: C901 } Returns: str: ARN of the training job, if it is created. + + Raises: + - botocore.exceptions.ClientError: If SageMaker throws an exception while creating + training job. + - ValueError: If both image_uri and algorithm are provided, or if neither is provided. """ tags = _append_project_tags(format_tags(tags)) tags = self._append_sagemaker_config_tags( @@ -1033,9 +1038,19 @@ def train( # noqa: C901 ) def submit(request): - logger.info("Creating training-job with name: %s", job_name) - logger.debug("train request: %s", json.dumps(request, indent=4)) - self.sagemaker_client.create_training_job(**request) + try: + logger.info("Creating training-job with name: %s", job_name) + logger.debug("train request: %s", json.dumps(request, indent=4)) + self.sagemaker_client.create_training_job(**request) + except Exception as e: + troubleshooting = ( + "https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html" + "#sagemaker-python-sdk-troubleshooting-create-training-job" + ) + logger.error( + "Please check the troubleshooting guide for common errors: %s", troubleshooting + ) + raise e self._intercept_create_request(train_request, submit, self.train.__name__) @@ -1342,6 +1357,15 @@ def update_training_job( remote_debug_config = { "EnableRemoteDebug": True, } + + Returns: + None: this method does not return a value. + + Raises: + - botocore.exceptions.ClientError: If SageMaker throws an error while updating training + job. + - botocore.exceptions.ParamValidationError: If any request parameters are in an invalid + format. 
""" # No injections from sagemaker_config because the UpdateTrainingJob API's resource_config # object accepts fewer parameters than the CreateTrainingJob API, and none that the @@ -1356,9 +1380,28 @@ def update_training_job( resource_config=resource_config, remote_debug_config=remote_debug_config, ) - logger.info("Updating training job with name %s", job_name) - logger.debug("Update request: %s", json.dumps(update_training_job_request, indent=4)) - self.sagemaker_client.update_training_job(**update_training_job_request) + try: + logger.info("Updating training job with name %s", job_name) + logger.debug("Update request: %s", json.dumps(update_training_job_request, indent=4)) + self.sagemaker_client.update_training_job(**update_training_job_request) + except botocore.exceptions.ParamValidationError as e: + troubleshooting = ( + "Incorrect request parameter was provided. Check the API documentation: " + "https://docs.aws.amazon.com/sagemaker/latest/APIReference/" + "API_UpdateTrainingJob.html#API_UpdateTrainingJob_RequestParameters" + ) + logger.error("%s", troubleshooting) + raise e + except botocore.exceptions.ClientError as e: + troubleshooting = ( + "https://docs.aws.amazon.com/sagemaker/latest/dg/" + "sagemaker-python-sdk-troubleshooting.html" + "#sagemaker-python-sdk-troubleshooting-update-training-job" + ) + logger.error( + "Please check the troubleshooting guide for common errors: %s", troubleshooting + ) + raise e def _get_update_training_job_request( self, @@ -1461,6 +1504,10 @@ def process( * If both `ExperimentName` and `TrialName` are not supplied the trial component will be unassociated. * `TrialComponentDisplayName` is used for display in Studio. + + Raises: + - botocore.exceptions.ClientError: If SageMaker throws an error while creating + processing job. 
""" tags = _append_project_tags(format_tags(tags)) tags = self._append_sagemaker_config_tags( @@ -1524,9 +1571,20 @@ def process( ) def submit(request): - logger.info("Creating processing-job with name %s", job_name) - logger.debug("process request: %s", json.dumps(request, indent=4)) - self.sagemaker_client.create_processing_job(**request) + try: + logger.info("Creating processing-job with name %s", job_name) + logger.debug("process request: %s", json.dumps(request, indent=4)) + self.sagemaker_client.create_processing_job(**request) + except Exception as e: + troubleshooting = ( + "https://docs.aws.amazon.com/sagemaker/latest/dg/" + "sagemaker-python-sdk-troubleshooting.html" + "#sagemaker-python-sdk-troubleshooting-create-processing-job" + ) + logger.error( + "Please check the troubleshooting guide for common errors: %s", troubleshooting + ) + raise e self._intercept_create_request(process_request, submit, self.process.__name__) @@ -4573,6 +4631,10 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live Returns: str: Name of the Amazon SageMaker ``Endpoint`` created. + + Raises: + botocore.exceptions.ClientError: If SageMaker throws an exception while creating + endpoint. 
""" logger.info("Creating endpoint with name %s", endpoint_name) @@ -4581,16 +4643,26 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live tags = self._append_sagemaker_config_tags( tags, "{}.{}.{}".format(SAGEMAKER, ENDPOINT, TAGS) ) - - res = self.sagemaker_client.create_endpoint( - EndpointName=endpoint_name, EndpointConfigName=config_name, Tags=tags - ) - if res: - self.endpoint_arn = res["EndpointArn"] - - if wait: - self.wait_for_endpoint(endpoint_name, live_logging=live_logging) - return endpoint_name + try: + res = self.sagemaker_client.create_endpoint( + EndpointName=endpoint_name, EndpointConfigName=config_name, Tags=tags + ) + if res: + self.endpoint_arn = res["EndpointArn"] + + if wait: + self.wait_for_endpoint(endpoint_name, live_logging=live_logging) + return endpoint_name + except Exception as e: + troubleshooting = ( + "https://docs.aws.amazon.com/sagemaker/latest/dg/" + "sagemaker-python-sdk-troubleshooting.html" + "#sagemaker-python-sdk-troubleshooting-create-endpoint" + ) + logger.error( + "Please check the troubleshooting guide for common errors: %s", troubleshooting + ) + raise e def endpoint_in_service_or_not(self, endpoint_name: str): """Check whether an Amazon SageMaker ``Endpoint``` is in IN_SERVICE status. @@ -4635,7 +4707,9 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True): str: Name of the Amazon SageMaker ``Endpoint`` being updated. 
Raises: - ValueError: if the endpoint does not already exist + - ValueError: if the endpoint does not already exist + - botocore.exceptions.ClientError: If SageMaker throws an error while + describing or updating the endpoint """ if not _deployment_entity_exists( lambda: self.sagemaker_client.describe_endpoint(EndpointName=endpoint_name) @@ -4645,15 +4719,27 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True): "existing endpoint name".format(endpoint_name) ) - res = self.sagemaker_client.update_endpoint( - EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name - ) - if res: - self.endpoint_arn = res["EndpointArn"] + try: - if wait: - self.wait_for_endpoint(endpoint_name) - return endpoint_name + res = self.sagemaker_client.update_endpoint( + EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name + ) + if res: + self.endpoint_arn = res["EndpointArn"] + + if wait: + self.wait_for_endpoint(endpoint_name) + return endpoint_name + except Exception as e: + troubleshooting = ( + "https://docs.aws.amazon.com/sagemaker/latest/dg/" + "sagemaker-python-sdk-troubleshooting.html" + "#sagemaker-python-sdk-troubleshooting-update-endpoint" + ) + logger.error( + "Please check the troubleshooting guide for common errors: %s", troubleshooting + ) + raise e def is_inference_component_based_endpoint(self, endpoint_name): """Returns 'True' if endpoint is inference-component-based, 'False' otherwise. @@ -4934,7 +5020,7 @@ def update_inference_component( return inference_component_name def delete_inference_component(self, inference_component_name: str, wait: bool = False): - """Deletes a InferenceComponent. + """Deletes an InferenceComponent. 
Args: inference_component_name (str): Name of the Amazon SageMaker ``InferenceComponent`` @@ -8502,8 +8588,19 @@ def _check_job_status(job, desc, status_key_name): elif status != "Completed": reason = desc.get("FailureReason", "(No reason provided)") job_type = status_key_name.replace("JobStatus", " job") - message = "Error for {job_type} {job_name}: {status}. Reason: {reason}".format( - job_type=job_type, job_name=job, status=status, reason=reason + troubleshooting = ( + "https://docs.aws.amazon.com/sagemaker/latest/dg/" + "sagemaker-python-sdk-troubleshooting.html" + ) + message = ( + "Error for {job_type} {job_name}: {status}. Reason: {reason}. " + "Check troubleshooting guide for common errors: {troubleshooting}" + ).format( + job_type=job_type, + job_name=job, + status=status, + reason=reason, + troubleshooting=troubleshooting, ) if "CapacityError" in str(reason): raise exceptions.CapacityError(