diff --git a/src/sagemaker/jumpstart/model.py b/src/sagemaker/jumpstart/model.py
index 65bb156ee3..c173ae55ff 100644
--- a/src/sagemaker/jumpstart/model.py
+++ b/src/sagemaker/jumpstart/model.py
@@ -817,6 +817,20 @@ def deploy(
                 f"{EndpointType.INFERENCE_COMPONENT_BASED} is not supported for Proprietary models."
             )
 
+        # If no `resources` were passed to deploy() but deploy_kwargs carries a resources
+        # entry, the default JumpStart resource requirements are in use.
+        if hasattr(self, "_is_sharded_model") and not resources and deploy_kwargs.resources:
+            if (
+                self._is_sharded_model
+                and deploy_kwargs.resources.num_cpus
+                and deploy_kwargs.resources.num_cpus > 0
+            ):
+                JUMPSTART_LOGGER.warning(
+                    "NumberOfCpuCoresRequired should be 0 for the best experience with SageMaker "
+                    "Fast Model Loading. Overriding the requested `num_cpus` to 0."
+                )
+                deploy_kwargs.resources.num_cpus = 0
+
         self.additional_model_data_sources = _add_model_access_configs_to_model_data_sources(
             self.additional_model_data_sources,
             deploy_kwargs.model_access_configs,
diff --git a/src/sagemaker/jumpstart/utils.py b/src/sagemaker/jumpstart/utils.py
index dfe3d7f1dd..d5c769efe0 100644
--- a/src/sagemaker/jumpstart/utils.py
+++ b/src/sagemaker/jumpstart/utils.py
@@ -1595,9 +1595,10 @@ def _add_model_access_configs_to_model_data_sources(
             )
             acked_model_data_sources.append(mutable_model_data_source)
         else:
-            mutable_model_data_source.pop(
-                "HostingEulaKey"
-            )  # pop when model access config is not applicable
+            if "HostingEulaKey" in mutable_model_data_source:
+                mutable_model_data_source.pop(
+                    "HostingEulaKey"
+                )  # pop only when present; the key is absent for pre-accepted sources
             acked_model_data_sources.append(mutable_model_data_source)
 
     return acked_model_data_sources
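
The guard above fixes a `KeyError`: the old unconditional `pop` assumed every gated data source still carried a `HostingEulaKey`, which is not true for sources whose EULA was already accepted. A minimal sketch of the pattern on a toy dict:

```python
source = {"ChannelName": "draft_model"}  # no "HostingEulaKey" present

# Unconditional pop raises for a missing key:
# source.pop("HostingEulaKey")  # KeyError: 'HostingEulaKey'

# Guarded pop, as in the change above:
if "HostingEulaKey" in source:
    source.pop("HostingEulaKey")

# Equivalent one-liner: pop with a default never raises.
source.pop("HostingEulaKey", None)
```

`pop("HostingEulaKey", None)` would collapse the guard to a single line; the explicit `if` simply keeps the explanatory comment attached to the branch that needs it.
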
diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py
index 83efa57cb8..b78a4a2a64 100644
--- a/src/sagemaker/model.py
+++ b/src/sagemaker/model.py
@@ -1600,18 +1600,25 @@ def deploy(
             if self._base_name is not None:
                 self._base_name = "-".join((self._base_name, compiled_model_suffix))
 
-        if self._is_sharded_model and endpoint_type != EndpointType.INFERENCE_COMPONENT_BASED:
-            logging.warning(
-                "Forcing INFERENCE_COMPONENT_BASED endpoint for sharded model. ADVISORY - "
-                "Use INFERENCE_COMPONENT_BASED endpoints over MODEL_BASED endpoints."
-            )
-            endpoint_type = EndpointType.INFERENCE_COMPONENT_BASED
+        if self._is_sharded_model:
+            if endpoint_type != EndpointType.INFERENCE_COMPONENT_BASED:
+                logging.warning(
+                    "Forcing INFERENCE_COMPONENT_BASED endpoint for sharded model. ADVISORY - "
+                    "Use INFERENCE_COMPONENT_BASED endpoints over MODEL_BASED endpoints."
+                )
+                endpoint_type = EndpointType.INFERENCE_COMPONENT_BASED
 
-        if self._is_sharded_model and self._enable_network_isolation:
-            raise ValueError(
-                "EnableNetworkIsolation cannot be set to True since SageMaker Fast Model "
-                "Loading of model requires network access."
-            )
+            if self._enable_network_isolation:
+                raise ValueError(
+                    "EnableNetworkIsolation cannot be set to True since SageMaker Fast Model "
+                    "Loading of model requires network access."
+                )
+
+            if resources and resources.num_cpus and resources.num_cpus > 0:
+                logger.warning(
+                    "NumberOfCpuCoresRequired should be 0 for the best experience with SageMaker "
+                    "Fast Model Loading. Configure by setting `num_cpus` to 0 in `resources`."
+                )
 
         # Support multiple models on same endpoint
         if endpoint_type == EndpointType.INFERENCE_COMPONENT_BASED:
@@ -1655,7 +1662,7 @@ def deploy(
                 vpc_config=self.vpc_config,
                 enable_network_isolation=self._enable_network_isolation,
                 role=self.role,
-                live_logging=endpoint_logging,
+                live_logging=False,  # TODO: enable when IC supports this
                 wait=wait,
             )
 
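
With the checks regrouped, `_is_sharded_model` now gates all three Fast Model Loading constraints in one place: the endpoint type is forced to inference components, network isolation is rejected, and a nonzero CPU request draws a warning. A condensed, self-contained sketch of that control flow (toy names, not the real `deploy()` internals):

```python
import logging
from enum import Enum

logger = logging.getLogger(__name__)


class EndpointType(Enum):
    MODEL_BASED = "ModelBased"
    INFERENCE_COMPONENT_BASED = "InferenceComponentBased"


def validate_sharded_deploy(is_sharded, endpoint_type, network_isolation, num_cpus):
    """Toy version of the guards deploy() applies to sharded models."""
    if is_sharded:
        if endpoint_type != EndpointType.INFERENCE_COMPONENT_BASED:
            logger.warning("Forcing INFERENCE_COMPONENT_BASED endpoint for sharded model.")
            endpoint_type = EndpointType.INFERENCE_COMPONENT_BASED
        if network_isolation:
            raise ValueError("EnableNetworkIsolation is incompatible with Fast Model Loading.")
        if num_cpus and num_cpus > 0:
            logger.warning("NumberOfCpuCoresRequired should be 0 for Fast Model Loading.")
    return endpoint_type
```

Note that the `num_cpus` check only warns at this layer; the JumpStart `deploy()` override above is what actually zeroes out the default resource requirements.
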
diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py
index 6a3b093ac5..802711e427 100644
--- a/src/sagemaker/serve/builder/model_builder.py
+++ b/src/sagemaker/serve/builder/model_builder.py
@@ -1302,6 +1302,10 @@ def _model_builder_optimize_wrapper(
         job_name = job_name or f"modelbuilderjob-{uuid.uuid4().hex}"
         if self._is_jumpstart_model_id():
             self.build(mode=self.mode, sagemaker_session=self.sagemaker_session)
+            if self.pysdk_model:
+                self.pysdk_model.set_deployment_config(
+                    instance_type=instance_type, config_name="lmi"
+                )
             input_args = self._optimize_for_jumpstart(
                 output_path=output_path,
                 instance_type=instance_type,
diff --git a/tests/integ/sagemaker/serve/test_serve_js_deep_unit_tests.py b/tests/integ/sagemaker/serve/test_serve_js_deep_unit_tests.py
new file mode 100644
index 0000000000..348c57745f
--- /dev/null
+++ b/tests/integ/sagemaker/serve/test_serve_js_deep_unit_tests.py
@@ -0,0 +1,238 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+from unittest.mock import MagicMock, patch, ANY
+
+from sagemaker.session import Session
+from sagemaker.serve.builder.model_builder import ModelBuilder
+from sagemaker.serve.builder.schema_builder import SchemaBuilder
+from sagemaker.resource_requirements import ResourceRequirements
+
+ROLE_NAME = "SageMakerRole"
+
+
+def test_js_model_with_optimize_speculative_decoding_config_gated_requests_are_expected(
+    sagemaker_session,
+):
+    with patch.object(
+        Session, "create_model", return_value="mock_model"
+    ) as mock_create_model, patch.object(
+        Session, "endpoint_from_production_variants"
+    ) as mock_endpoint_from_production_variants:
+        iam_client = sagemaker_session.boto_session.client("iam")
+        role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
+
+        schema_builder = SchemaBuilder("test", "test")
+        model_builder = ModelBuilder(
+            model="meta-textgeneration-llama-3-1-8b-instruct",
+            schema_builder=schema_builder,
+            sagemaker_session=sagemaker_session,
+            role_arn=role_arn,
+        )
+
+        optimized_model = model_builder.optimize(
+            instance_type="ml.g5.xlarge",  # set to small instance in case a network call is made
+            speculative_decoding_config={
+                "ModelProvider": "JumpStart",
+                "ModelID": "meta-textgeneration-llama-3-2-1b",
+                "AcceptEula": True,
+            },
+            accept_eula=True,
+        )
+
+        optimized_model.deploy()
+
+        mock_create_model.assert_called_once_with(
+            name=ANY,
+            role=ANY,
+            container_defs={
+                "Image": ANY,
+                "Environment": {
+                    "SAGEMAKER_PROGRAM": "inference.py",
+                    "ENDPOINT_SERVER_TIMEOUT": "3600",
+                    "MODEL_CACHE_ROOT": "/opt/ml/model",
+                    "SAGEMAKER_ENV": "1",
+                    "HF_MODEL_ID": "/opt/ml/model",
+                    "SAGEMAKER_MODEL_SERVER_WORKERS": "1",
+                    "OPTION_SPECULATIVE_DRAFT_MODEL": "/opt/ml/additional-model-data-sources/draft_model/",
+                },
+                "AdditionalModelDataSources": [
+                    {
+                        "ChannelName": "draft_model",
+                        "S3DataSource": {
+                            "S3Uri": ANY,
+                            "S3DataType": "S3Prefix",
+                            "CompressionType": "None",
+                            "ModelAccessConfig": {"AcceptEula": True},
+                        },
+                    }
+                ],
+                "ModelDataSource": {
+                    "S3DataSource": {
+                        "S3Uri": ANY,
+                        "S3DataType": "S3Prefix",
+                        "CompressionType": "None",
+                        "ModelAccessConfig": {"AcceptEula": True},
+                    }
+                },
+            },
+            vpc_config=None,
+            enable_network_isolation=True,
+            tags=ANY,
+        )
+        mock_endpoint_from_production_variants.assert_called_once()
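
The `OPTION_SPECULATIVE_DRAFT_MODEL` value asserted above reflects the container convention that each `AdditionalModelDataSources` entry is mounted under `/opt/ml/additional-model-data-sources/<ChannelName>/`. A hypothetical helper (not part of the SDK) that makes the mapping explicit:

```python
def additional_model_data_path(channel_name: str) -> str:
    """Hypothetical helper: container mount point for an additional model data source."""
    return f"/opt/ml/additional-model-data-sources/{channel_name}/"


assert additional_model_data_path("draft_model") == (
    "/opt/ml/additional-model-data-sources/draft_model/"
)
```
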
{"OPTION_TENSOR_PARALLEL_DEGREE": "8"}}, + accept_eula=True, + ) + + optimized_model.deploy( + resources=ResourceRequirements(requests={"memory": 196608, "num_accelerators": 8}) + ) + + mock_create_model.assert_called_once_with( + name=ANY, + role=ANY, + container_defs={ + "Image": ANY, + "Environment": { + "SAGEMAKER_PROGRAM": "inference.py", + "ENDPOINT_SERVER_TIMEOUT": "3600", + "MODEL_CACHE_ROOT": "/opt/ml/model", + "SAGEMAKER_ENV": "1", + "HF_MODEL_ID": "/opt/ml/model", + "SAGEMAKER_MODEL_SERVER_WORKERS": "1", + "OPTION_TENSOR_PARALLEL_DEGREE": "8", + }, + "ModelDataSource": { + "S3DataSource": { + "S3Uri": ANY, + "S3DataType": "S3Prefix", + "CompressionType": "None", + "ModelAccessConfig": {"AcceptEula": True}, + } + }, + }, + vpc_config=None, + enable_network_isolation=False, # should be set to false + tags=ANY, + ) + mock_endpoint_from_production_variants.assert_called_once_with( + name=ANY, + production_variants=ANY, + tags=ANY, + kms_key=ANY, + vpc_config=ANY, + enable_network_isolation=False, + role=ANY, + live_logging=False, # this should be set to false for IC + wait=True, + ) + mock_create_inference_component.assert_called_once() + + +def test_js_model_with_optimize_quantization_on_pre_optimized_model_requests_are_expected( + sagemaker_session, +): + with patch.object( + Session, + "wait_for_optimization_job", + return_value={"OptimizationJobName": "mock_optimization_job"}, + ), patch.object( + Session, "create_model", return_value="mock_model" + ) as mock_create_model, patch.object( + Session, "endpoint_from_production_variants", return_value="mock_endpoint_name" + ) as mock_endpoint_from_production_variants: + iam_client = sagemaker_session.boto_session.client("iam") + role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"] + + sagemaker_session.sagemaker_client.create_optimization_job = MagicMock() + + schema_builder = SchemaBuilder("test", "test") + model_builder = ModelBuilder( + model="meta-textgeneration-llama-3-1-8b-instruct", + schema_builder=schema_builder, + sagemaker_session=sagemaker_session, + role_arn=role_arn, + ) + + optimized_model = model_builder.optimize( + instance_type="ml.g5.xlarge", # set to small instance in case a network call is made + quantization_config={ + "OverrideEnvironment": { + "OPTION_QUANTIZE": "fp8", + }, + }, + accept_eula=True, + ) + + optimized_model.deploy() + + mock_create_model.assert_called_once_with( + name=ANY, + role=ANY, + container_defs={ + "Image": ANY, + "Environment": { + "SAGEMAKER_PROGRAM": "inference.py", + "ENDPOINT_SERVER_TIMEOUT": "3600", + "MODEL_CACHE_ROOT": "/opt/ml/model", + "SAGEMAKER_ENV": "1", + "HF_MODEL_ID": "/opt/ml/model", + "SAGEMAKER_MODEL_SERVER_WORKERS": "1", + "OPTION_QUANTIZE": "fp8", + }, + "ModelDataSource": { + "S3DataSource": { + "S3Uri": ANY, + "S3DataType": "S3Prefix", + "CompressionType": "None", + "ModelAccessConfig": {"AcceptEula": True}, + } + }, + }, + vpc_config=None, + enable_network_isolation=True, # should be set to false + tags=ANY, + ) + mock_endpoint_from_production_variants.assert_called_once() diff --git a/tests/unit/sagemaker/jumpstart/test_utils.py b/tests/unit/sagemaker/jumpstart/test_utils.py index 67681e2b7b..d228b4450e 100644 --- a/tests/unit/sagemaker/jumpstart/test_utils.py +++ b/tests/unit/sagemaker/jumpstart/test_utils.py @@ -2318,6 +2318,28 @@ def test_multiple_gated_additional_model_data_source_should_accept_both(self): + self.MOCK_GATED_DEPLOY_CONFIG_ADDITIONAL_MODEL_DATA_SOURCE_POST_CALL ) + def 
+
+
+def test_js_model_with_optimize_quantization_on_pre_optimized_model_requests_are_expected(
+    sagemaker_session,
+):
+    with patch.object(
+        Session,
+        "wait_for_optimization_job",
+        return_value={"OptimizationJobName": "mock_optimization_job"},
+    ), patch.object(
+        Session, "create_model", return_value="mock_model"
+    ) as mock_create_model, patch.object(
+        Session, "endpoint_from_production_variants", return_value="mock_endpoint_name"
+    ) as mock_endpoint_from_production_variants:
+        iam_client = sagemaker_session.boto_session.client("iam")
+        role_arn = iam_client.get_role(RoleName=ROLE_NAME)["Role"]["Arn"]
+
+        sagemaker_session.sagemaker_client.create_optimization_job = MagicMock()
+
+        schema_builder = SchemaBuilder("test", "test")
+        model_builder = ModelBuilder(
+            model="meta-textgeneration-llama-3-1-8b-instruct",
+            schema_builder=schema_builder,
+            sagemaker_session=sagemaker_session,
+            role_arn=role_arn,
+        )
+
+        optimized_model = model_builder.optimize(
+            instance_type="ml.g5.xlarge",  # set to small instance in case a network call is made
+            quantization_config={
+                "OverrideEnvironment": {
+                    "OPTION_QUANTIZE": "fp8",
+                },
+            },
+            accept_eula=True,
+        )
+
+        optimized_model.deploy()
+
+        mock_create_model.assert_called_once_with(
+            name=ANY,
+            role=ANY,
+            container_defs={
+                "Image": ANY,
+                "Environment": {
+                    "SAGEMAKER_PROGRAM": "inference.py",
+                    "ENDPOINT_SERVER_TIMEOUT": "3600",
+                    "MODEL_CACHE_ROOT": "/opt/ml/model",
+                    "SAGEMAKER_ENV": "1",
+                    "HF_MODEL_ID": "/opt/ml/model",
+                    "SAGEMAKER_MODEL_SERVER_WORKERS": "1",
+                    "OPTION_QUANTIZE": "fp8",
+                },
+                "ModelDataSource": {
+                    "S3DataSource": {
+                        "S3Uri": ANY,
+                        "S3DataType": "S3Prefix",
+                        "CompressionType": "None",
+                        "ModelAccessConfig": {"AcceptEula": True},
+                    }
+                },
+            },
+            vpc_config=None,
+            enable_network_isolation=True,  # quantization alone leaves network isolation enabled
+            tags=ANY,
+        )
+        mock_endpoint_from_production_variants.assert_called_once()
diff --git a/tests/unit/sagemaker/jumpstart/test_utils.py b/tests/unit/sagemaker/jumpstart/test_utils.py
index 67681e2b7b..d228b4450e 100644
--- a/tests/unit/sagemaker/jumpstart/test_utils.py
+++ b/tests/unit/sagemaker/jumpstart/test_utils.py
@@ -2318,6 +2318,28 @@ def test_multiple_gated_additional_model_data_source_should_accept_both(self):
             + self.MOCK_GATED_DEPLOY_CONFIG_ADDITIONAL_MODEL_DATA_SOURCE_POST_CALL
         )
 
+    def test_gated_additional_model_data_source_already_accepted_with_no_hosting_eula_key_should_pass_through(
+        self,
+    ):
+        mock_gated_deploy_config_additional_model_data_pre_accepted = [
+            {
+                "ChannelName": "draft_model",
+                "S3DataSource": {
+                    "CompressionType": "None",
+                    "S3DataType": "S3Prefix",
+                    "S3Uri": "s3://jumpstart_bucket/path/to/gated/resources/",
+                    "ModelAccessConfig": {"AcceptEula": True},
+                },
+            }
+        ]
+
+        utils._add_model_access_configs_to_model_data_sources(
+            model_data_sources=mock_gated_deploy_config_additional_model_data_pre_accepted,
+            model_access_configs={self.MOCK_GATED_MODEL_ID: ModelAccessConfig(accept_eula=False)},
+            model_id=self.MOCK_GATED_MODEL_ID,
+            region=JUMPSTART_DEFAULT_REGION_NAME,
+        )
+
     # Mixed Positive Cases
 
     def test_multiple_mixed_additional_model_data_source_should_pass_through_one_accept_the_other(
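
The new test passes as long as no `KeyError` escapes: a pre-accepted additional model data source has no `HostingEulaKey`, so the pass-through branch must tolerate its absence. A toy model of that branch, self-contained and runnable:

```python
import copy


def passthrough(sources):
    """Toy version of the pass-through branch: copy each source, dropping
    HostingEulaKey only if it is present."""
    out = []
    for src in sources:
        src = copy.deepcopy(src)
        src.pop("HostingEulaKey", None)
        out.append(src)
    return out


pre_accepted = [
    {
        "ChannelName": "draft_model",
        "S3DataSource": {"ModelAccessConfig": {"AcceptEula": True}},
    }
]
assert passthrough(pre_accepted) == pre_accepted  # unchanged when the key is absent
```
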
diff --git a/tests/unit/sagemaker/model/test_model.py b/tests/unit/sagemaker/model/test_model.py
index 316df7420d..9175613662 100644
--- a/tests/unit/sagemaker/model/test_model.py
+++ b/tests/unit/sagemaker/model/test_model.py
@@ -1482,3 +1482,47 @@ def test_model_source(
     )
 
     assert model_1._get_model_uri() == "s3://tmybuckaet"
+
+
+@patch("sagemaker.utils.repack_model")
+@patch("sagemaker.fw_utils.tar_and_upload_dir")
+def test_deploy_sharded_model_with_cpus_requested_raises_warning(
+    repack_model, tar_and_upload_dir, sagemaker_session
+):
+    from unittest import mock
+
+    framework_model_classes_to_kwargs = {
+        HuggingFaceModel: {
+            "pytorch_version": "1.7.1",
+            "py_version": "py36",
+            "transformers_version": "4.6.1",
+        },
+    }
+
+    sagemaker_session.settings = SessionSettings(include_jumpstart_tags=False)
+
+    source_dir = "s3://blah/blah/blah"
+    for framework_model_class, kwargs in framework_model_classes_to_kwargs.items():
+        test_sharded_model = framework_model_class(
+            entry_point=ENTRY_POINT_INFERENCE,
+            role=ROLE,
+            sagemaker_session=sagemaker_session,
+            model_data=source_dir,
+            **kwargs,
+        )
+        test_sharded_model._is_sharded_model = True
+        with mock.patch("sagemaker.model.logger") as mock_logger:
+            mock_logger.warning.reset_mock()
+            test_sharded_model.deploy(
+                instance_type="ml.m2.xlarge",
+                initial_instance_count=INSTANCE_COUNT,
+                endpoint_type=EndpointType.MODEL_BASED,
+                resources=ResourceRequirements(
+                    requests={"num_accelerators": 1, "memory": 8192, "copies": 1, "num_cpus": 1},
+                    limits={},
+                ),
+            )
+            mock_logger.warning.assert_called_once_with(
+                "NumberOfCpuCoresRequired should be 0 for the best experience with SageMaker "
+                "Fast Model Loading. Configure by setting `num_cpus` to 0 in `resources`."
+            )
diff --git a/tests/unit/sagemaker/serve/builder/test_js_builder.py b/tests/unit/sagemaker/serve/builder/test_js_builder.py
index 25bc67d22d..b6bd69e304 100644
--- a/tests/unit/sagemaker/serve/builder/test_js_builder.py
+++ b/tests/unit/sagemaker/serve/builder/test_js_builder.py
@@ -1605,3 +1605,161 @@ def test_optimize_compile_for_jumpstart_without_compilation_config(
         self.assertEqual(optimized_model.env["SAGEMAKER_ENV"], "1")
         self.assertEqual(optimized_model.env["HF_MODEL_ID"], "/opt/ml/model")
         self.assertEqual(optimized_model.env["SAGEMAKER_MODEL_SERVER_WORKERS"], "1")
+
+
+class TestJumpStartModelBuilderOptimizationUseCases(unittest.TestCase):
+
+    @patch("sagemaker.serve.builder.jumpstart_builder._capture_telemetry", side_effect=None)
+    @patch.object(ModelBuilder, "_get_serve_setting", autospec=True)
+    @patch(
+        "sagemaker.serve.builder.jumpstart_builder.JumpStart._is_gated_model",
+        return_value=True,
+    )
+    @patch("sagemaker.serve.builder.jumpstart_builder.JumpStartModel")
+    @patch(
+        "sagemaker.serve.builder.jumpstart_builder.JumpStart._is_jumpstart_model_id",
+        return_value=True,
+    )
+    @patch(
+        "sagemaker.serve.builder.jumpstart_builder.JumpStart._is_fine_tuned_model",
+        return_value=False,
+    )
+    def test_optimize_on_js_model_should_ignore_pre_optimized_configurations(
+        self,
+        mock_is_fine_tuned,
+        mock_is_jumpstart_model,
+        mock_js_model,
+        mock_is_gated_model,
+        mock_serve_settings,
+        mock_telemetry,
+    ):
+        mock_sagemaker_session = Mock()
+        mock_sagemaker_session.wait_for_optimization_job.side_effect = (
+            lambda *args: mock_optimization_job_response
+        )
+
+        mock_lmi_js_model = MagicMock()
+        mock_lmi_js_model.image_uri = mock_djl_image_uri
+        mock_lmi_js_model.env = {
+            "SAGEMAKER_PROGRAM": "inference.py",
+            "ENDPOINT_SERVER_TIMEOUT": "3600",
+            "MODEL_CACHE_ROOT": "/opt/ml/model",
+            "SAGEMAKER_ENV": "1",
+            "HF_MODEL_ID": "/opt/ml/model",
+            "OPTION_ENFORCE_EAGER": "true",
+            "SAGEMAKER_MODEL_SERVER_WORKERS": "1",
+            "OPTION_TENSOR_PARALLEL_DEGREE": "8",
+        }
+
+        mock_js_model.return_value = mock_lmi_js_model
+
+        model_builder = ModelBuilder(
+            model="meta-textgeneration-llama-3-1-70b-instruct",
+            schema_builder=SchemaBuilder("test", "test"),
+            sagemaker_session=mock_sagemaker_session,
+        )
+
+        optimized_model = model_builder.optimize(
+            accept_eula=True,
+            instance_type="ml.g5.24xlarge",
+            quantization_config={
+                "OverrideEnvironment": {
+                    "OPTION_QUANTIZE": "fp8",
+                    "OPTION_TENSOR_PARALLEL_DEGREE": "4",
+                },
+            },
+            output_path="s3://bucket/code/",
+        )
+
+        assert mock_lmi_js_model.set_deployment_config.call_args_list[0].kwargs == {
+            "instance_type": "ml.g5.24xlarge",
+            "config_name": "lmi",
+        }
+        assert optimized_model.env == {
+            "SAGEMAKER_PROGRAM": "inference.py",
+            "ENDPOINT_SERVER_TIMEOUT": "3600",
+            "MODEL_CACHE_ROOT": "/opt/ml/model",
+            "SAGEMAKER_ENV": "1",
+            "HF_MODEL_ID": "/opt/ml/model",
+            "OPTION_ENFORCE_EAGER": "true",
+            "SAGEMAKER_MODEL_SERVER_WORKERS": "1",
+            "OPTION_TENSOR_PARALLEL_DEGREE": "4",  # should be overridden from 8 to 4
+            "OPTION_QUANTIZE": "fp8",  # should be added to the env
+        }
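
The assertions pin two behaviors: `set_deployment_config` is called with the `lmi` config before optimization, and `OverrideEnvironment` entries win over the pre-optimized config's environment. The expected env reduces to a dict union in which the override comes last:

```python
pre_optimized_env = {"OPTION_TENSOR_PARALLEL_DEGREE": "8", "OPTION_ENFORCE_EAGER": "true"}
override_env = {"OPTION_QUANTIZE": "fp8", "OPTION_TENSOR_PARALLEL_DEGREE": "4"}

merged = {**pre_optimized_env, **override_env}  # the later mapping wins on conflicts
assert merged["OPTION_TENSOR_PARALLEL_DEGREE"] == "4"
assert merged["OPTION_QUANTIZE"] == "fp8"
```
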
"sagemaker.serve.builder.jumpstart_builder.JumpStart._is_fine_tuned_model", + return_value=False, + ) + def test_optimize_on_js_model_should_ignore_pre_optimized_configurations_no_override( + self, + mock_is_fine_tuned, + mock_is_jumpstart_model, + mock_js_model, + mock_is_gated_model, + mock_serve_settings, + mock_telemetry, + ): + mock_sagemaker_session = Mock() + mock_sagemaker_session.wait_for_optimization_job.side_effect = ( + lambda *args: mock_optimization_job_response + ) + + mock_lmi_js_model = MagicMock() + mock_lmi_js_model.image_uri = mock_djl_image_uri + mock_lmi_js_model.env = { + "SAGEMAKER_PROGRAM": "inference.py", + "ENDPOINT_SERVER_TIMEOUT": "3600", + "MODEL_CACHE_ROOT": "/opt/ml/model", + "SAGEMAKER_ENV": "1", + "HF_MODEL_ID": "/opt/ml/model", + "OPTION_ENFORCE_EAGER": "true", + "SAGEMAKER_MODEL_SERVER_WORKERS": "1", + "OPTION_TENSOR_PARALLEL_DEGREE": "8", + } + + mock_js_model.return_value = mock_lmi_js_model + + model_builder = ModelBuilder( + model="meta-textgeneration-llama-3-1-70b-instruct", + schema_builder=SchemaBuilder("test", "test"), + sagemaker_session=mock_sagemaker_session, + ) + + optimized_model = model_builder.optimize( + accept_eula=True, + instance_type="ml.g5.24xlarge", + quantization_config={ + "OverrideEnvironment": { + "OPTION_QUANTIZE": "fp8", + }, + }, + output_path="s3://bucket/code/", + ) + + assert mock_lmi_js_model.set_deployment_config.call_args_list[0].kwargs == { + "instance_type": "ml.g5.24xlarge", + "config_name": "lmi", + } + assert optimized_model.env == { + "SAGEMAKER_PROGRAM": "inference.py", + "ENDPOINT_SERVER_TIMEOUT": "3600", + "MODEL_CACHE_ROOT": "/opt/ml/model", + "SAGEMAKER_ENV": "1", + "HF_MODEL_ID": "/opt/ml/model", + "OPTION_ENFORCE_EAGER": "true", + "SAGEMAKER_MODEL_SERVER_WORKERS": "1", + "OPTION_TENSOR_PARALLEL_DEGREE": "8", + "OPTION_QUANTIZE": "fp8", # should be added to the env + }