Skip to content
14 changes: 14 additions & 0 deletions src/sagemaker/jumpstart/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,6 +817,20 @@ def deploy(
f"{EndpointType.INFERENCE_COMPONENT_BASED} is not supported for Proprietary models."
)

# `resources` was not passed to deploy(), but `deploy_kwargs.resources` is set, which means
# the default JumpStart resource requirements are being used
if hasattr(self, "_is_sharded_model") and not resources and deploy_kwargs.resources:
if (
self._is_sharded_model
and deploy_kwargs.resources.num_cpus
and deploy_kwargs.resources.num_cpus > 0
):
JUMPSTART_LOGGER.warning(
"NumOfCpuCoresRequired should be 0 for the best experience with SageMaker Fast "
"Model Loading. Overriding the requested `num_cpus` to 0."
)
deploy_kwargs.resources.num_cpus = 0

self.additional_model_data_sources = _add_model_access_configs_to_model_data_sources(
self.additional_model_data_sources,
deploy_kwargs.model_access_configs,
Expand Down
7 changes: 4 additions & 3 deletions src/sagemaker/jumpstart/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1595,9 +1595,10 @@ def _add_model_access_configs_to_model_data_sources(
)
acked_model_data_sources.append(mutable_model_data_source)
else:
mutable_model_data_source.pop(
"HostingEulaKey"
) # pop when model access config is not applicable
if "HostingEulaKey" in mutable_model_data_source:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a test for this in test_model_builder, in addition to what you already have? This is a very critical path in ModelBuilder.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See tests/integ/sagemaker/serve/test_serve_js_deep_unit_tests.py, under test_js_model_with_optimize_speculative_decoding_config_gated_requests_are_expected.

mutable_model_data_source.pop(
"HostingEulaKey"
) # pop when model access config is not applicable
acked_model_data_sources.append(mutable_model_data_source)
return acked_model_data_sources

Expand Down
29 changes: 18 additions & 11 deletions src/sagemaker/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -1600,18 +1600,25 @@ def deploy(
if self._base_name is not None:
self._base_name = "-".join((self._base_name, compiled_model_suffix))

if self._is_sharded_model and endpoint_type != EndpointType.INFERENCE_COMPONENT_BASED:
logging.warning(
"Forcing INFERENCE_COMPONENT_BASED endpoint for sharded model. ADVISORY - "
"Use INFERENCE_COMPONENT_BASED endpoints over MODEL_BASED endpoints."
)
endpoint_type = EndpointType.INFERENCE_COMPONENT_BASED
if self._is_sharded_model:
if endpoint_type != EndpointType.INFERENCE_COMPONENT_BASED:
logging.warning(
"Forcing INFERENCE_COMPONENT_BASED endpoint for sharded model. ADVISORY - "
"Use INFERENCE_COMPONENT_BASED endpoints over MODEL_BASED endpoints."
)
endpoint_type = EndpointType.INFERENCE_COMPONENT_BASED

if self._is_sharded_model and self._enable_network_isolation:
raise ValueError(
"EnableNetworkIsolation cannot be set to True since SageMaker Fast Model "
"Loading of model requires network access."
)
if self._enable_network_isolation:
raise ValueError(
"EnableNetworkIsolation cannot be set to True since SageMaker Fast Model "
"Loading of model requires network access."
)

if resources and resources.num_cpus and resources.num_cpus > 0:
logger.warning(
"NumberOfCpuCoresRequired should be 0 for the best experience with SageMaker "
"Fast Model Loading. Configure by setting `num_cpus` to 0 in `resources`."
)

# Support multiple models on same endpoint
if endpoint_type == EndpointType.INFERENCE_COMPONENT_BASED:
Expand Down
4 changes: 4 additions & 0 deletions src/sagemaker/serve/builder/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1302,6 +1302,10 @@ def _model_builder_optimize_wrapper(
job_name = job_name or f"modelbuilderjob-{uuid.uuid4().hex}"
if self._is_jumpstart_model_id():
self.build(mode=self.mode, sagemaker_session=self.sagemaker_session)
if self.pysdk_model:
self.pysdk_model.set_deployment_config(
instance_type=instance_type, config_name="lmi"
)
input_args = self._optimize_for_jumpstart(
output_path=output_path,
instance_type=instance_type,
Expand Down
22 changes: 22 additions & 0 deletions tests/unit/sagemaker/jumpstart/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2318,6 +2318,28 @@ def test_multiple_gated_additional_model_data_source_should_accept_both(self):
+ self.MOCK_GATED_DEPLOY_CONFIG_ADDITIONAL_MODEL_DATA_SOURCE_POST_CALL
)

def test_gated_additional_model_data_source_already_accepted_with_no_hosting_eula_key_should_pass_through(
    self,
):
    """A pre-accepted data source without ``HostingEulaKey`` must not make the helper raise.

    The data source already carries ``ModelAccessConfig: {"AcceptEula": True}`` and has no
    ``HostingEulaKey`` entry, while the caller-supplied access config declines the EULA.
    The test has no explicit assertion: it passes as long as the call completes without
    raising.
    """
    # Data source whose EULA is already accepted and which has no "HostingEulaKey".
    pre_accepted_draft_model_source = {
        "ChannelName": "draft_model",
        "S3DataSource": {
            "CompressionType": "None",
            "S3DataType": "S3Prefix",
            "S3Uri": "s3://jumpstart_bucket/path/to/gated/resources/",
            "ModelAccessConfig": {"AcceptEula": True},
        },
    }
    # Access config for the gated model id that does NOT accept the EULA.
    declined_access_configs = {
        self.MOCK_GATED_MODEL_ID: ModelAccessConfig(accept_eula=False),
    }

    utils._add_model_access_configs_to_model_data_sources(
        model_data_sources=[pre_accepted_draft_model_source],
        model_access_configs=declined_access_configs,
        model_id=self.MOCK_GATED_MODEL_ID,
        region=JUMPSTART_DEFAULT_REGION_NAME,
    )

# Mixed Positive Cases

def test_multiple_mixed_additional_model_data_source_should_pass_through_one_accept_the_other(
Expand Down
44 changes: 44 additions & 0 deletions tests/unit/sagemaker/model/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -1482,3 +1482,47 @@ def test_model_source(
)

assert model_1._get_model_uri() == "s3://tmybuckaet"


@patch("sagemaker.utils.repack_model")
@patch("sagemaker.fw_utils.tar_and_upload_dir")
def test_deploy_sharded_model_with_cpus_requested_raises_warning(
    tar_and_upload_dir, repack_model, sagemaker_session
):
    """Deploying a sharded model that requests CPU cores logs the ``num_cpus`` advisory once.

    For each framework model class under test, the model is flagged as sharded and deployed
    with ``num_cpus=1`` in ``resources``. The test then checks that the patched
    ``sagemaker.model.logger`` received exactly one ``warning`` call carrying the advisory text.

    Args:
        tar_and_upload_dir: Mock for ``sagemaker.fw_utils.tar_and_upload_dir``. Stacked
            ``@patch`` decorators inject their mocks bottom-up, so the decorator closest
            to ``def`` supplies the first positional argument.
        repack_model: Mock for ``sagemaker.utils.repack_model`` (outermost decorator).
        sagemaker_session: Session fixture used to construct and deploy the model.
    """
    # Function-scope import, hoisted out of the loop so it is executed only once.
    from unittest import mock

    framework_model_classes_to_kwargs = {
        HuggingFaceModel: {
            "pytorch_version": "1.7.1",
            "py_version": "py36",
            "transformers_version": "4.6.1",
        },
    }

    sagemaker_session.settings = SessionSettings(include_jumpstart_tags=False)

    source_dir = "s3://blah/blah/blah"
    expected_warning = (
        "NumberOfCpuCoresRequired should be 0 for the best experience with SageMaker "
        "Fast Model Loading. Configure by setting `num_cpus` to 0 in `resources`."
    )

    for framework_model_class, kwargs in framework_model_classes_to_kwargs.items():
        test_sharded_model = framework_model_class(
            entry_point=ENTRY_POINT_INFERENCE,
            role=ROLE,
            sagemaker_session=sagemaker_session,
            model_data=source_dir,
            **kwargs,
        )
        test_sharded_model._is_sharded_model = True

        # ``mock.patch`` yields a fresh mock on every entry, so no reset is needed before
        # counting the calls made during this deploy.
        with mock.patch("sagemaker.model.logger") as mock_logger:
            test_sharded_model.deploy(
                instance_type="ml.m2.xlarge",
                initial_instance_count=INSTANCE_COUNT,
                endpoint_type=EndpointType.MODEL_BASED,
                resources=ResourceRequirements(
                    requests={"num_accelerators": 1, "memory": 8192, "copies": 1, "num_cpus": 1},
                    limits={},
                ),
            )
            mock_logger.warning.assert_called_once_with(expected_warning)
Loading