From fb7b64d01f532b57cc0496c590bff8d55113ab34 Mon Sep 17 00:00:00 2001
From: adishaa <adishaa@amazon.com>
Date: Tue, 20 Aug 2024 15:14:16 -0700
Subject: [PATCH] change: Add troubleshooting links to exceptions

---
 .pylintrc                       |   2 +-
 src/sagemaker/algorithm.py      |  14 +++
 src/sagemaker/base_predictor.py |   2 +
 src/sagemaker/estimator.py      |  32 ++++++-
 src/sagemaker/local/entities.py |  12 ++-
 src/sagemaker/session.py        | 159 +++++++++++++++++++++++++-------
 6 files changed, 181 insertions(+), 40 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 11e2ababa9..dd7c59831e 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -384,7 +384,7 @@ max-returns=6
 max-branches=12
 
 # Maximum number of statements in function / method body
-max-statements=100
+max-statements=105
 
 # Maximum number of parents for a class (see R0901).
 max-parents=7
diff --git a/src/sagemaker/algorithm.py b/src/sagemaker/algorithm.py
index 51c93c4986..f3fd2c954e 100644
--- a/src/sagemaker/algorithm.py
+++ b/src/sagemaker/algorithm.py
@@ -157,6 +157,20 @@ def __init__(
                 available (default: ``None``).
             **kwargs: Additional kwargs. This is unused. It's only added for AlgorithmEstimator
                 to ignore the irrelevant arguments.
+
+        Raises:
+            ValueError:
+            - If an AWS IAM Role is not provided.
+            - Bad value for instance type.
+            RuntimeError:
+            - When setting up custom VPC, only one of subnets and security_group_ids is provided
+            - If instance_count > 1 (distributed training) with instance type local_gpu
+            - If LocalSession is not used with instance type local or local gpu
+            - file:// output path used outside of local mode
+            botocore.exceptions.ClientError:
+            - algorithm arn is incorrect
+            - insufficient permission to access/describe algorithm
+            - algorithm is in a different region
         """
         self.algorithm_arn = algorithm_arn
         super(AlgorithmEstimator, self).__init__(
diff --git a/src/sagemaker/base_predictor.py b/src/sagemaker/base_predictor.py
index 1a7eea9cd7..a9b2cb021d 100644
--- a/src/sagemaker/base_predictor.py
+++ b/src/sagemaker/base_predictor.py
@@ -430,6 +430,8 @@ def update_endpoint(
                 - If ``initial_instance_count``, ``instance_type``, or ``accelerator_type`` is
                   specified and either ``model_name`` is ``None`` or there are multiple models
                   associated with the endpoint.
+            botocore.exceptions.ClientError: If SageMaker throws an error while creating
+            endpoint config, describing endpoint or updating endpoint
         """
         production_variants = None
         current_model_names = self._get_model_names()
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
index 66b746b1b0..6f02fde8e8 100644
--- a/src/sagemaker/estimator.py
+++ b/src/sagemaker/estimator.py
@@ -590,25 +590,36 @@ def __init__(
         self.dependencies = dependencies or []
         self.uploaded_code: Optional[UploadedCode] = None
 
-        # Check that the user properly sets both subnet and secutiry_groupe_ids
+        # Check that the user properly sets both subnet and security_group_ids
         if (
             subnets is not None
             and security_group_ids is None
             or security_group_ids is not None
             and subnets is None
         ):
+            troubleshooting = (
+                "Refer to this documentation on using custom VPC: "
+                "https://sagemaker.readthedocs.io/en/v2.24.0/overview.html"
+                "#secure-training-and-inference-with-vpc"
+            )
+            logger.error("Check troubleshooting guide for common errors: %s", troubleshooting)
+
             raise RuntimeError(
                 "When setting up custom VPC, both subnets and security_group_ids must be set"
             )
 
         if self.instance_type in ("local", "local_gpu"):
             if self.instance_type == "local_gpu" and self.instance_count > 1:
-                raise RuntimeError("Distributed Training in Local GPU is not supported")
+                raise RuntimeError(
+                    "Distributed Training in Local GPU is not supported."
+                    " Set instance_count to 1."
+                )
             self.sagemaker_session = sagemaker_session or LocalSession()
             if not isinstance(self.sagemaker_session, sagemaker.local.LocalSession):
                 raise RuntimeError(
                     "instance_type local or local_gpu is only supported with an"
-                    "instance of LocalSession"
+                    " instance of LocalSession. More details on local mode: "
+                    "https://sagemaker.readthedocs.io/en/stable/overview.html#local-mode"
                 )
         else:
             self.sagemaker_session = sagemaker_session or Session()
@@ -631,7 +642,11 @@ def __init__(
             and not is_pipeline_variable(output_path)
             and output_path.startswith("file://")
         ):
-            raise RuntimeError("file:// output paths are only supported in Local Mode")
+            raise RuntimeError(
+                "The 'file://' output paths are only supported when using Local Mode. "
+                "To resolve this issue, ensure you're running in Local Mode with a LocalSession, "
+                "or use an 's3://' output path for jobs running on SageMaker instances."
+            )
         self.output_path = output_path
         self.latest_training_job = None
         self.jobs = []
@@ -646,7 +661,12 @@ def __init__(
             # Now we marked that as Optional because we can fetch it from SageMakerConfig
             # Because of marking that parameter as optional, we should validate if it is None, even
             # after fetching the config.
-            raise ValueError("An AWS IAM role is required to create an estimator.")
+            raise ValueError(
+                "An AWS IAM role is required to create an estimator. "
+                "Please provide a valid `role` argument with the ARN of an IAM role"
+                " that has the necessary SageMaker permissions."
+            )
+
         self.output_kms_key = resolve_value_from_config(
             output_kms_key, TRAINING_JOB_KMS_KEY_ID_PATH, sagemaker_session=self.sagemaker_session
         )
@@ -1855,6 +1875,8 @@ def model_data(self):
             if compression_type not in {"GZIP", "NONE"}:
                 raise ValueError(
                     f'Unrecognized training job output data compression type "{compression_type}"'
+                    '. Please specify either "GZIP" or "NONE" as valid options for '
+                    "the compression type."
                 )
             # model data is in uncompressed form NOTE SageMaker Hosting mandates presence of
             # trailing forward slash in S3 model data URI, so append one if necessary.
diff --git a/src/sagemaker/local/entities.py b/src/sagemaker/local/entities.py
index 2ce37f68bd..a21a375f54 100644
--- a/src/sagemaker/local/entities.py
+++ b/src/sagemaker/local/entities.py
@@ -213,6 +213,10 @@ def start(self, input_data_config, output_data_config, hyperparameters, environm
             hyperparameters (dict): The HyperParameters for the training job.
             environment (dict): The collection of environment variables passed to the job.
             job_name (str): Name of the local training job being run.
+
+        Raises:
+            ValueError: If the input data configuration is not valid.
+            RuntimeError: If the data distribution type is not supported.
         """
         for channel in input_data_config:
             if channel["DataSource"] and "S3DataSource" in channel["DataSource"]:
@@ -233,10 +237,12 @@ def start(self, input_data_config, output_data_config, hyperparameters, environm
             # use a single Data URI - this makes handling S3 and File Data easier down the stack
             channel["DataUri"] = data_uri
 
-            if data_distribution and data_distribution != "FullyReplicated":
+            supported_distributions = ["FullyReplicated"]
+            if data_distribution and data_distribution not in supported_distributions:
                 raise RuntimeError(
-                    "DataDistribution: %s is not currently supported in Local Mode"
-                    % data_distribution
+                    "Invalid DataDistribution: '{}'. Local mode currently supports: {}.".format(
+                        data_distribution, ", ".join(supported_distributions)
+                    )
                 )
 
         self.start_time = datetime.datetime.now()
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
index 7035d9547d..b10a809259 100644
--- a/src/sagemaker/session.py
+++ b/src/sagemaker/session.py
@@ -950,6 +950,11 @@ def train(  # noqa: C901
                     }
         Returns:
             str: ARN of the training job, if it is created.
+
+        Raises:
+            - botocore.exceptions.ClientError: If SageMaker throws an exception while creating
+            training job.
+            - ValueError: If both image_uri and algorithm are provided, or if neither is provided.
         """
         tags = _append_project_tags(format_tags(tags))
         tags = self._append_sagemaker_config_tags(
@@ -1033,9 +1038,19 @@ def train(  # noqa: C901
         )
 
         def submit(request):
-            logger.info("Creating training-job with name: %s", job_name)
-            logger.debug("train request: %s", json.dumps(request, indent=4))
-            self.sagemaker_client.create_training_job(**request)
+            try:
+                logger.info("Creating training-job with name: %s", job_name)
+                logger.debug("train request: %s", json.dumps(request, indent=4))
+                self.sagemaker_client.create_training_job(**request)
+            except Exception as e:
+                troubleshooting = (
+                    "https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html"
+                    "#sagemaker-python-sdk-troubleshooting-create-training-job"
+                )
+                logger.error(
+                    "Please check the troubleshooting guide for common errors: %s", troubleshooting
+                )
+                raise e
 
         self._intercept_create_request(train_request, submit, self.train.__name__)
 
@@ -1342,6 +1357,15 @@ def update_training_job(
                     remote_debug_config = {
                         "EnableRemoteDebug": True,
                     }
+
+        Returns:
+            None: This method does not return a value.
+
+        Raises:
+            - botocore.exceptions.ClientError: If SageMaker throws an error while updating training
+            job.
+            - botocore.exceptions.ParamValidationError: If any request parameters are in an invalid
+            format.
         """
         # No injections from sagemaker_config because the UpdateTrainingJob API's resource_config
         # object accepts fewer parameters than the CreateTrainingJob API, and none that the
@@ -1356,9 +1380,28 @@ def update_training_job(
             resource_config=resource_config,
             remote_debug_config=remote_debug_config,
         )
-        logger.info("Updating training job with name %s", job_name)
-        logger.debug("Update request: %s", json.dumps(update_training_job_request, indent=4))
-        self.sagemaker_client.update_training_job(**update_training_job_request)
+        try:
+            logger.info("Updating training job with name %s", job_name)
+            logger.debug("Update request: %s", json.dumps(update_training_job_request, indent=4))
+            self.sagemaker_client.update_training_job(**update_training_job_request)
+        except botocore.exceptions.ParamValidationError as e:
+            troubleshooting = (
+                "Incorrect request parameter was provided. Check the API documentation: "
+                "https://docs.aws.amazon.com/sagemaker/latest/APIReference/"
+                "API_UpdateTrainingJob.html#API_UpdateTrainingJob_RequestParameters"
+            )
+            logger.error("%s", troubleshooting)
+            raise e
+        except botocore.exceptions.ClientError as e:
+            troubleshooting = (
+                "https://docs.aws.amazon.com/sagemaker/latest/dg/"
+                "sagemaker-python-sdk-troubleshooting.html"
+                "#sagemaker-python-sdk-troubleshooting-update-training-job"
+            )
+            logger.error(
+                "Please check the troubleshooting guide for common errors: %s", troubleshooting
+            )
+            raise e
 
     def _get_update_training_job_request(
         self,
@@ -1461,6 +1504,10 @@ def process(
                 * If both `ExperimentName` and `TrialName` are not supplied the trial component
                 will be unassociated.
                 * `TrialComponentDisplayName` is used for display in Studio.
+
+        Raises:
+            - botocore.exceptions.ClientError: If SageMaker throws an error while creating
+            processing job.
         """
         tags = _append_project_tags(format_tags(tags))
         tags = self._append_sagemaker_config_tags(
@@ -1524,9 +1571,20 @@ def process(
         )
 
         def submit(request):
-            logger.info("Creating processing-job with name %s", job_name)
-            logger.debug("process request: %s", json.dumps(request, indent=4))
-            self.sagemaker_client.create_processing_job(**request)
+            try:
+                logger.info("Creating processing-job with name %s", job_name)
+                logger.debug("process request: %s", json.dumps(request, indent=4))
+                self.sagemaker_client.create_processing_job(**request)
+            except Exception as e:
+                troubleshooting = (
+                    "https://docs.aws.amazon.com/sagemaker/latest/dg/"
+                    "sagemaker-python-sdk-troubleshooting.html"
+                    "#sagemaker-python-sdk-troubleshooting-create-processing-job"
+                )
+                logger.error(
+                    "Please check the troubleshooting guide for common errors: %s", troubleshooting
+                )
+                raise e
 
         self._intercept_create_request(process_request, submit, self.process.__name__)
 
@@ -4573,6 +4631,10 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live
 
         Returns:
             str: Name of the Amazon SageMaker ``Endpoint`` created.
+
+        Raises:
+            botocore.exceptions.ClientError: If SageMaker throws an exception while creating
+            endpoint.
         """
         logger.info("Creating endpoint with name %s", endpoint_name)
 
@@ -4581,16 +4643,26 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live
         tags = self._append_sagemaker_config_tags(
             tags, "{}.{}.{}".format(SAGEMAKER, ENDPOINT, TAGS)
         )
-
-        res = self.sagemaker_client.create_endpoint(
-            EndpointName=endpoint_name, EndpointConfigName=config_name, Tags=tags
-        )
-        if res:
-            self.endpoint_arn = res["EndpointArn"]
-
-        if wait:
-            self.wait_for_endpoint(endpoint_name, live_logging=live_logging)
-        return endpoint_name
+        try:
+            res = self.sagemaker_client.create_endpoint(
+                EndpointName=endpoint_name, EndpointConfigName=config_name, Tags=tags
+            )
+            if res:
+                self.endpoint_arn = res["EndpointArn"]
+
+            if wait:
+                self.wait_for_endpoint(endpoint_name, live_logging=live_logging)
+            return endpoint_name
+        except Exception as e:
+            troubleshooting = (
+                "https://docs.aws.amazon.com/sagemaker/latest/dg/"
+                "sagemaker-python-sdk-troubleshooting.html"
+                "#sagemaker-python-sdk-troubleshooting-create-endpoint"
+            )
+            logger.error(
+                "Please check the troubleshooting guide for common errors: %s", troubleshooting
+            )
+            raise e
 
     def endpoint_in_service_or_not(self, endpoint_name: str):
         """Check whether an Amazon SageMaker ``Endpoint``` is in IN_SERVICE status.
@@ -4635,7 +4707,9 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True):
             str: Name of the Amazon SageMaker ``Endpoint`` being updated.
 
         Raises:
-            ValueError: if the endpoint does not already exist
+            - ValueError: if the endpoint does not already exist
+            - botocore.exceptions.ClientError: If SageMaker throws an error while
+            creating endpoint config, describing endpoint or updating endpoint
         """
         if not _deployment_entity_exists(
             lambda: self.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
@@ -4645,15 +4719,27 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True):
                 "existing endpoint name".format(endpoint_name)
             )
 
-        res = self.sagemaker_client.update_endpoint(
-            EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
-        )
-        if res:
-            self.endpoint_arn = res["EndpointArn"]
+        try:
 
-        if wait:
-            self.wait_for_endpoint(endpoint_name)
-        return endpoint_name
+            res = self.sagemaker_client.update_endpoint(
+                EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
+            )
+            if res:
+                self.endpoint_arn = res["EndpointArn"]
+
+            if wait:
+                self.wait_for_endpoint(endpoint_name)
+            return endpoint_name
+        except Exception as e:
+            troubleshooting = (
+                "https://docs.aws.amazon.com/sagemaker/latest/dg/"
+                "sagemaker-python-sdk-troubleshooting.html"
+                "#sagemaker-python-sdk-troubleshooting-update-endpoint"
+            )
+            logger.error(
+                "Please check the troubleshooting guide for common errors: %s", troubleshooting
+            )
+            raise e
 
     def is_inference_component_based_endpoint(self, endpoint_name):
         """Returns 'True' if endpoint is inference-component-based, 'False' otherwise.
@@ -4934,7 +5020,7 @@ def update_inference_component(
         return inference_component_name
 
     def delete_inference_component(self, inference_component_name: str, wait: bool = False):
-        """Deletes a InferenceComponent.
+        """Deletes an InferenceComponent.
 
         Args:
             inference_component_name (str): Name of the Amazon SageMaker ``InferenceComponent``
@@ -8502,8 +8588,19 @@ def _check_job_status(job, desc, status_key_name):
     elif status != "Completed":
         reason = desc.get("FailureReason", "(No reason provided)")
         job_type = status_key_name.replace("JobStatus", " job")
-        message = "Error for {job_type} {job_name}: {status}. Reason: {reason}".format(
-            job_type=job_type, job_name=job, status=status, reason=reason
+        troubleshooting = (
+            "https://docs.aws.amazon.com/sagemaker/latest/dg/"
+            "sagemaker-python-sdk-troubleshooting.html"
+        )
+        message = (
+            "Error for {job_type} {job_name}: {status}. Reason: {reason}. "
+            "Check troubleshooting guide for common errors: {troubleshooting}"
+        ).format(
+            job_type=job_type,
+            job_name=job,
+            status=status,
+            reason=reason,
+            troubleshooting=troubleshooting,
         )
         if "CapacityError" in str(reason):
             raise exceptions.CapacityError(