Skip to content

change: Add troubleshooting links to exceptions #4844

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ max-returns=6
max-branches=12

# Maximum number of statements in function / method body
max-statements=100
max-statements=105

# Maximum number of parents for a class (see R0901).
max-parents=7
Expand Down
14 changes: 14 additions & 0 deletions src/sagemaker/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,20 @@ def __init__(
available (default: ``None``).
**kwargs: Additional kwargs. This is unused. It's only added for AlgorithmEstimator
to ignore the irrelevant arguments.
Raises:
ValueError:
- If an AWS IAM Role is not provided.
- Bad value for instance type.
RuntimeError:
- When setting up custom VPC, only one of subnets and security_group_ids is provided
- If instance_count > 1 (distributed training) with instance type local or local gpu
- If LocalSession is not used with instance type local or local gpu
- file:// output path used outside of local mode
botocore.exceptions.ClientError:
- algorithm arn is incorrect
- insufficient permission to access/describe algorithm
- algorithm is in a different region
"""
self.algorithm_arn = algorithm_arn
super(AlgorithmEstimator, self).__init__(
Expand Down
2 changes: 2 additions & 0 deletions src/sagemaker/base_predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,8 @@ def update_endpoint(
- If ``initial_instance_count``, ``instance_type``, or ``accelerator_type`` is
specified and either ``model_name`` is ``None`` or there are multiple models
associated with the endpoint.
botocore.exceptions.ClientError: If SageMaker throws an error while creating
endpoint config, describing endpoint or updating endpoint
"""
production_variants = None
current_model_names = self._get_model_names()
Expand Down
32 changes: 27 additions & 5 deletions src/sagemaker/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,25 +590,36 @@ def __init__(
self.dependencies = dependencies or []
self.uploaded_code: Optional[UploadedCode] = None

# Check that the user properly sets both subnet and secutiry_groupe_ids
# Check that the user properly sets both subnet and security_group_ids
if (
subnets is not None
and security_group_ids is None
or security_group_ids is not None
and subnets is None
):
troubleshooting = (
"Refer to this documentation on using custom VPC: "
"https://sagemaker.readthedocs.io/en/v2.24.0/overview.html"
"#secure-training-and-inference-with-vpc"
)
logger.error("Check troubleshooting guide for common errors: %s", troubleshooting)

raise RuntimeError(
"When setting up custom VPC, both subnets and security_group_ids must be set"
)

if self.instance_type in ("local", "local_gpu"):
if self.instance_type == "local_gpu" and self.instance_count > 1:
raise RuntimeError("Distributed Training in Local GPU is not supported")
raise RuntimeError(
"Distributed Training in Local GPU is not supported."
" Set instance_count to 1."
)
self.sagemaker_session = sagemaker_session or LocalSession()
if not isinstance(self.sagemaker_session, sagemaker.local.LocalSession):
raise RuntimeError(
"instance_type local or local_gpu is only supported with an"
"instance of LocalSession"
"instance of LocalSession. More details on local mode: "
"https://sagemaker.readthedocs.io/en/stable/overview.html#local-mode"
)
else:
self.sagemaker_session = sagemaker_session or Session()
Expand All @@ -631,7 +642,11 @@ def __init__(
and not is_pipeline_variable(output_path)
and output_path.startswith("file://")
):
raise RuntimeError("file:// output paths are only supported in Local Mode")
raise RuntimeError(
"The 'file://' output paths are only supported when using Local Mode. "
"To resolve this issue, ensure you're running in Local Mode with a LocalSession, "
"or use an 's3://' output path for jobs running on SageMaker instances."
)
self.output_path = output_path
self.latest_training_job = None
self.jobs = []
Expand All @@ -646,7 +661,12 @@ def __init__(
# Now we marked that as Optional because we can fetch it from SageMakerConfig
# Because of marking that parameter as optional, we should validate if it is None, even
# after fetching the config.
raise ValueError("An AWS IAM role is required to create an estimator.")
raise ValueError(
"An AWS IAM role is required to create an estimator. "
"Please provide a valid `role` argument with the ARN of an IAM role"
" that has the necessary SageMaker permissions."
)

self.output_kms_key = resolve_value_from_config(
output_kms_key, TRAINING_JOB_KMS_KEY_ID_PATH, sagemaker_session=self.sagemaker_session
)
Expand Down Expand Up @@ -1855,6 +1875,8 @@ def model_data(self):
if compression_type not in {"GZIP", "NONE"}:
raise ValueError(
f'Unrecognized training job output data compression type "{compression_type}"'
'. Please specify either "GZIP" or "NONE" as valid options for '
"the compression type."
)
# model data is in uncompressed form NOTE SageMaker Hosting mandates presence of
# trailing forward slash in S3 model data URI, so append one if necessary.
Expand Down
12 changes: 9 additions & 3 deletions src/sagemaker/local/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,10 @@ def start(self, input_data_config, output_data_config, hyperparameters, environm
hyperparameters (dict): The HyperParameters for the training job.
environment (dict): The collection of environment variables passed to the job.
job_name (str): Name of the local training job being run.

Raises:
ValueError: If the input data configuration is not valid.
RuntimeError: If the data distribution type is not supported.
"""
for channel in input_data_config:
if channel["DataSource"] and "S3DataSource" in channel["DataSource"]:
Expand All @@ -233,10 +237,12 @@ def start(self, input_data_config, output_data_config, hyperparameters, environm
# use a single Data URI - this makes handling S3 and File Data easier down the stack
channel["DataUri"] = data_uri

if data_distribution and data_distribution != "FullyReplicated":
supported_distributions = ["FullyReplicated"]
if data_distribution and data_distribution not in supported_distributions:
raise RuntimeError(
"DataDistribution: %s is not currently supported in Local Mode"
% data_distribution
"Invalid DataDistribution: '{}'. Local mode currently supports: {}.".format(
data_distribution, ", ".join(supported_distributions)
)
)

self.start_time = datetime.datetime.now()
Expand Down
159 changes: 128 additions & 31 deletions src/sagemaker/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -950,6 +950,11 @@ def train( # noqa: C901
}
Returns:
str: ARN of the training job, if it is created.

Raises:
- botocore.exceptions.ClientError: If SageMaker throws an exception while creating
training job.
- ValueError: If both image_uri and algorithm are provided, or if neither is provided.
"""
tags = _append_project_tags(format_tags(tags))
tags = self._append_sagemaker_config_tags(
Expand Down Expand Up @@ -1033,9 +1038,19 @@ def train( # noqa: C901
)

def submit(request):
logger.info("Creating training-job with name: %s", job_name)
logger.debug("train request: %s", json.dumps(request, indent=4))
self.sagemaker_client.create_training_job(**request)
try:
logger.info("Creating training-job with name: %s", job_name)
logger.debug("train request: %s", json.dumps(request, indent=4))
self.sagemaker_client.create_training_job(**request)
except Exception as e:
troubleshooting = (
"https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html"
"#sagemaker-python-sdk-troubleshooting-create-training-job"
)
logger.error(
"Please check the troubleshooting guide for common errors: %s", troubleshooting
)
raise e

self._intercept_create_request(train_request, submit, self.train.__name__)

Expand Down Expand Up @@ -1342,6 +1357,15 @@ def update_training_job(
remote_debug_config = {
"EnableRemoteDebug": True,
}

Returns:
str: ARN of training job

Raises:
- botocore.exceptions.ClientError: If SageMaker throws an error while updating training
job.
- botocore.exceptions.ParamValidationError: If any request parameters are in an invalid
format.
"""
# No injections from sagemaker_config because the UpdateTrainingJob API's resource_config
# object accepts fewer parameters than the CreateTrainingJob API, and none that the
Expand All @@ -1356,9 +1380,28 @@ def update_training_job(
resource_config=resource_config,
remote_debug_config=remote_debug_config,
)
logger.info("Updating training job with name %s", job_name)
logger.debug("Update request: %s", json.dumps(update_training_job_request, indent=4))
self.sagemaker_client.update_training_job(**update_training_job_request)
try:
logger.info("Updating training job with name %s", job_name)
logger.debug("Update request: %s", json.dumps(update_training_job_request, indent=4))
self.sagemaker_client.update_training_job(**update_training_job_request)
except botocore.exceptions.ParamValidationError as e:
troubleshooting = (
"Incorrect request parameter was provided. Check the API documentation: "
"https://docs.aws.amazon.com/sagemaker/latest/APIReference/"
"API_UpdateTrainingJob.html#API_UpdateTrainingJob_RequestParameters"
)
logger.error("%s", troubleshooting)
raise e
except botocore.exceptions.ClientError as e:
troubleshooting = (
"https://docs.aws.amazon.com/sagemaker/latest/dg/"
"sagemaker-python-sdk-troubleshooting.html"
"#sagemaker-python-sdk-troubleshooting-update-training-job"
)
logger.error(
"Please check the troubleshooting guide for common errors: %s", troubleshooting
)
raise e

def _get_update_training_job_request(
self,
Expand Down Expand Up @@ -1461,6 +1504,10 @@ def process(
* If both `ExperimentName` and `TrialName` are not supplied the trial component
will be unassociated.
* `TrialComponentDisplayName` is used for display in Studio.

Raises:
- botocore.exceptions.ClientError: If SageMaker throws an error while creating
processing job.
"""
tags = _append_project_tags(format_tags(tags))
tags = self._append_sagemaker_config_tags(
Expand Down Expand Up @@ -1524,9 +1571,20 @@ def process(
)

def submit(request):
logger.info("Creating processing-job with name %s", job_name)
logger.debug("process request: %s", json.dumps(request, indent=4))
self.sagemaker_client.create_processing_job(**request)
try:
logger.info("Creating processing-job with name %s", job_name)
logger.debug("process request: %s", json.dumps(request, indent=4))
self.sagemaker_client.create_processing_job(**request)
except Exception as e:
troubleshooting = (
"https://docs.aws.amazon.com/sagemaker/latest/dg/"
"sagemaker-python-sdk-troubleshooting.html"
"#sagemaker-python-sdk-troubleshooting-create-processing-job"
)
logger.error(
"Please check the troubleshooting guide for common errors: %s", troubleshooting
)
raise e

self._intercept_create_request(process_request, submit, self.process.__name__)

Expand Down Expand Up @@ -4573,6 +4631,10 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live

Returns:
str: Name of the Amazon SageMaker ``Endpoint`` created.

Raises:
botocore.exceptions.ClientError: If SageMaker throws an exception while creating
endpoint.
"""
logger.info("Creating endpoint with name %s", endpoint_name)

Expand All @@ -4581,16 +4643,26 @@ def create_endpoint(self, endpoint_name, config_name, tags=None, wait=True, live
tags = self._append_sagemaker_config_tags(
tags, "{}.{}.{}".format(SAGEMAKER, ENDPOINT, TAGS)
)

res = self.sagemaker_client.create_endpoint(
EndpointName=endpoint_name, EndpointConfigName=config_name, Tags=tags
)
if res:
self.endpoint_arn = res["EndpointArn"]

if wait:
self.wait_for_endpoint(endpoint_name, live_logging=live_logging)
return endpoint_name
try:
res = self.sagemaker_client.create_endpoint(
EndpointName=endpoint_name, EndpointConfigName=config_name, Tags=tags
)
if res:
self.endpoint_arn = res["EndpointArn"]

if wait:
self.wait_for_endpoint(endpoint_name, live_logging=live_logging)
return endpoint_name
except Exception as e:
troubleshooting = (
"https://docs.aws.amazon.com/sagemaker/latest/dg/"
"sagemaker-python-sdk-troubleshooting.html"
"#sagemaker-python-sdk-troubleshooting-create-endpoint"
)
logger.error(
"Please check the troubleshooting guide for common errors: %s", troubleshooting
)
raise e

def endpoint_in_service_or_not(self, endpoint_name: str):
"""Check whether an Amazon SageMaker ``Endpoint`` is in IN_SERVICE status.
Expand Down Expand Up @@ -4635,7 +4707,9 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True):
str: Name of the Amazon SageMaker ``Endpoint`` being updated.

Raises:
ValueError: if the endpoint does not already exist
- ValueError: if the endpoint does not already exist
- botocore.exceptions.ClientError: If SageMaker throws an error while
creating endpoint config, describing endpoint or updating endpoint
"""
if not _deployment_entity_exists(
lambda: self.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
Expand All @@ -4645,15 +4719,27 @@ def update_endpoint(self, endpoint_name, endpoint_config_name, wait=True):
"existing endpoint name".format(endpoint_name)
)

res = self.sagemaker_client.update_endpoint(
EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
if res:
self.endpoint_arn = res["EndpointArn"]
try:

if wait:
self.wait_for_endpoint(endpoint_name)
return endpoint_name
res = self.sagemaker_client.update_endpoint(
EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
if res:
self.endpoint_arn = res["EndpointArn"]

if wait:
self.wait_for_endpoint(endpoint_name)
return endpoint_name
except Exception as e:
troubleshooting = (
"https://docs.aws.amazon.com/sagemaker/latest/dg/"
"sagemaker-python-sdk-troubleshooting.html"
"#sagemaker-python-sdk-troubleshooting-update-endpoint"
)
logger.error(
"Please check the troubleshooting guide for common errors: %s", troubleshooting
)
raise e

def is_inference_component_based_endpoint(self, endpoint_name):
"""Returns 'True' if endpoint is inference-component-based, 'False' otherwise.
Expand Down Expand Up @@ -4934,7 +5020,7 @@ def update_inference_component(
return inference_component_name

def delete_inference_component(self, inference_component_name: str, wait: bool = False):
"""Deletes a InferenceComponent.
"""Deletes an InferenceComponent.

Args:
inference_component_name (str): Name of the Amazon SageMaker ``InferenceComponent``
Expand Down Expand Up @@ -8502,8 +8588,19 @@ def _check_job_status(job, desc, status_key_name):
elif status != "Completed":
reason = desc.get("FailureReason", "(No reason provided)")
job_type = status_key_name.replace("JobStatus", " job")
message = "Error for {job_type} {job_name}: {status}. Reason: {reason}".format(
job_type=job_type, job_name=job, status=status, reason=reason
troubleshooting = (
"https://docs.aws.amazon.com/sagemaker/latest/dg/"
"sagemaker-python-sdk-troubleshooting.html"
)
message = (
"Error for {job_type} {job_name}: {status}. Reason: {reason}. "
"Check troubleshooting guide for common errors: {troubleshooting}"
).format(
job_type=job_type,
job_name=job,
status=status,
reason=reason,
troubleshooting=troubleshooting,
)
if "CapacityError" in str(reason):
raise exceptions.CapacityError(
Expand Down