diff --git a/src/sagemaker/huggingface/model.py b/src/sagemaker/huggingface/model.py index 05b981d21b..3ca25fb3ce 100644 --- a/src/sagemaker/huggingface/model.py +++ b/src/sagemaker/huggingface/model.py @@ -218,6 +218,7 @@ def deploy( container_startup_health_check_timeout=None, inference_recommendation_id=None, explainer_config=None, + update_endpoint: Optional[bool] = False, **kwargs, ): """Deploy this ``Model`` to an ``Endpoint`` and optionally return a ``Predictor``. @@ -296,6 +297,11 @@ def deploy( would like to deploy the model and endpoint with recommended parameters. explainer_config (sagemaker.explainer.ExplainerConfig): Specifies online explainability configuration for use with Amazon SageMaker Clarify. (default: None) + update_endpoint (Optional[bool]): + Flag to update the model in an existing Amazon SageMaker endpoint. + If True, this will deploy a new EndpointConfig to an already existing endpoint + and delete resources corresponding to the previous EndpointConfig. Default: False + Note: Currently this is supported for single model endpoints Raises: ValueError: If arguments combination check failed in these circumstances: - If no role is specified or @@ -335,6 +341,7 @@ def deploy( container_startup_health_check_timeout=container_startup_health_check_timeout, inference_recommendation_id=inference_recommendation_id, explainer_config=explainer_config, + update_endpoint=update_endpoint, **kwargs, ) diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index e5ea1ea314..b281d9f489 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -53,7 +53,6 @@ from sagemaker.model_card.schema_constraints import ModelApprovalStatusEnum from sagemaker.session import Session from sagemaker.model_metrics import ModelMetrics -from sagemaker.deprecations import removed_kwargs from sagemaker.drift_check_baselines import DriftCheckBaselines from sagemaker.explainer import ExplainerConfig from sagemaker.metadata_properties import MetadataProperties @@ 
-1386,6 +1385,7 @@ def deploy( routing_config: Optional[Dict[str, Any]] = None, model_reference_arn: Optional[str] = None, inference_ami_version: Optional[str] = None, + update_endpoint: Optional[bool] = False, **kwargs, ): """Deploy this ``Model`` to an ``Endpoint`` and optionally return a ``Predictor``. @@ -1497,6 +1497,11 @@ def deploy( inference_ami_version (Optional [str]): Specifies an option from a collection of preconfigured Amazon Machine Image (AMI) images. For a full list of options, see: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html + update_endpoint (Optional[bool]): + Flag to update the model in an existing Amazon SageMaker endpoint. + If True, this will deploy a new EndpointConfig to an already existing endpoint + and delete resources corresponding to the previous EndpointConfig. Default: False + Note: Currently this is supported for single model endpoints Raises: ValueError: If arguments combination check failed in these circumstances: - If no role is specified or @@ -1512,8 +1517,6 @@ def deploy( """ self.accept_eula = accept_eula - removed_kwargs("update_endpoint", kwargs) - self._init_sagemaker_session_if_does_not_exist(instance_type) # Depending on the instance type, a local session (or) a session is initialized. 
self.role = resolve_value_from_config( @@ -1628,6 +1631,10 @@ def deploy( # Support multiple models on same endpoint if endpoint_type == EndpointType.INFERENCE_COMPONENT_BASED: + if update_endpoint: + raise ValueError( + "Currently update_endpoint is supported for single model endpoints" + ) if endpoint_name: self.endpoint_name = endpoint_name else: @@ -1783,17 +1790,38 @@ def deploy( if is_explainer_enabled: explainer_config_dict = explainer_config._to_request_dict() - self.sagemaker_session.endpoint_from_production_variants( - name=self.endpoint_name, - production_variants=[production_variant], - tags=tags, - kms_key=kms_key, - wait=wait, - data_capture_config_dict=data_capture_config_dict, - explainer_config_dict=explainer_config_dict, - async_inference_config_dict=async_inference_config_dict, - live_logging=endpoint_logging, - ) + if update_endpoint: + endpoint_config_name = self.sagemaker_session.create_endpoint_config( + name=self.name, + model_name=self.name, + initial_instance_count=initial_instance_count, + instance_type=instance_type, + accelerator_type=accelerator_type, + tags=tags, + kms_key=kms_key, + data_capture_config_dict=data_capture_config_dict, + volume_size=volume_size, + model_data_download_timeout=model_data_download_timeout, + container_startup_health_check_timeout=container_startup_health_check_timeout, + explainer_config_dict=explainer_config_dict, + async_inference_config_dict=async_inference_config_dict, + serverless_inference_config_dict=serverless_inference_config_dict, + routing_config=routing_config, + inference_ami_version=inference_ami_version, + ) + self.sagemaker_session.update_endpoint(self.endpoint_name, endpoint_config_name) + else: + self.sagemaker_session.endpoint_from_production_variants( + name=self.endpoint_name, + production_variants=[production_variant], + tags=tags, + kms_key=kms_key, + wait=wait, + data_capture_config_dict=data_capture_config_dict, + explainer_config_dict=explainer_config_dict, +
async_inference_config_dict=async_inference_config_dict, + live_logging=endpoint_logging, + ) if self.predictor_cls: predictor = self.predictor_cls(self.endpoint_name, self.sagemaker_session) diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py index a7a518105c..9122f22e44 100644 --- a/src/sagemaker/serve/builder/model_builder.py +++ b/src/sagemaker/serve/builder/model_builder.py @@ -1602,6 +1602,7 @@ def deploy( ResourceRequirements, ] ] = None, + update_endpoint: Optional[bool] = False, ) -> Union[Predictor, Transformer]: """Deploys the built Model. @@ -1615,24 +1616,33 @@ def deploy( AsyncInferenceConfig, BatchTransformInferenceConfig, ResourceRequirements]]) : Additional Config for different deployment types such as serverless, async, batch and multi-model/container + update_endpoint (Optional[bool]): + Flag to update the model in an existing Amazon SageMaker endpoint. + If True, this will deploy a new EndpointConfig to an already existing endpoint + and delete resources corresponding to the previous EndpointConfig.
Default: False + Note: Currently this is supported for single model endpoints Returns: Transformer for Batch Deployments Predictors for all others """ if not hasattr(self, "built_model"): raise ValueError("Model Needs to be built before deploying") - endpoint_name = unique_name_from_base(endpoint_name) + if not update_endpoint: + endpoint_name = unique_name_from_base(endpoint_name) + if not inference_config: # Real-time Deployment return self.built_model.deploy( instance_type=self.instance_type, initial_instance_count=initial_instance_count, endpoint_name=endpoint_name, + update_endpoint=update_endpoint, ) if isinstance(inference_config, ServerlessInferenceConfig): return self.built_model.deploy( serverless_inference_config=inference_config, endpoint_name=endpoint_name, + update_endpoint=update_endpoint, ) if isinstance(inference_config, AsyncInferenceConfig): @@ -1641,6 +1651,7 @@ def deploy( initial_instance_count=initial_instance_count, async_inference_config=inference_config, endpoint_name=endpoint_name, + update_endpoint=update_endpoint, ) if isinstance(inference_config, BatchTransformInferenceConfig): @@ -1652,6 +1663,10 @@ def deploy( return transformer if isinstance(inference_config, ResourceRequirements): + if update_endpoint: + raise ValueError( + "Currently update_endpoint is supported for single model endpoints" + ) # Multi Model and MultiContainer endpoints with Inference Component return self.built_model.deploy( instance_type=self.instance_type, @@ -1660,6 +1675,7 @@ def deploy( resources=inference_config, initial_instance_count=initial_instance_count, role=self.role_arn, + update_endpoint=update_endpoint, ) raise ValueError("Deployment Options not supported") diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index b2398e03d1..38fa7f8c26 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -4488,6 +4488,10 @@ def create_endpoint_config( model_data_download_timeout=None, container_startup_health_check_timeout=None,
explainer_config_dict=None, + async_inference_config_dict=None, + serverless_inference_config_dict=None, + routing_config: Optional[Dict[str, Any]] = None, + inference_ami_version: Optional[str] = None, ): """Create an Amazon SageMaker endpoint configuration. @@ -4525,6 +4529,30 @@ def create_endpoint_config( -inference-algo-ping-requests explainer_config_dict (dict): Specifies configuration to enable explainers. Default: None. + async_inference_config_dict (dict): Specifies + configuration related to async endpoint. Use this configuration when trying + to create async endpoint and make async inference. If empty config object + passed through, will use default config to deploy async endpoint. Deploy a + real-time endpoint if it's None. (default: None). + serverless_inference_config_dict (dict): + Specifies configuration related to serverless endpoint. Use this configuration + when trying to create serverless endpoint and make serverless inference. If + empty object passed through, will use pre-defined values in + ``ServerlessInferenceConfig`` class to deploy serverless endpoint. Deploy an + instance based endpoint if it's None. (default: None). + routing_config (Optional[Dict[str, Any]]): Settings that control how the endpoint routes + incoming traffic to the instances that the endpoint hosts. + Currently, support dictionary key ``RoutingStrategy``. + + .. code:: python + + { + "RoutingStrategy": sagemaker.enums.RoutingStrategy.RANDOM + } + inference_ami_version (Optional[str]): + Specifies an option from a collection of preconfigured + Amazon Machine Image (AMI) images.
For a full list of options, see: + https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html Example: >>> tags = [{'Key': 'tagname', 'Value': 'tagvalue'}] @@ -4544,9 +4572,12 @@ def create_endpoint_config( instance_type, initial_instance_count, accelerator_type=accelerator_type, + serverless_inference_config=serverless_inference_config_dict, volume_size=volume_size, model_data_download_timeout=model_data_download_timeout, container_startup_health_check_timeout=container_startup_health_check_timeout, + routing_config=routing_config, + inference_ami_version=inference_ami_version, ) production_variants = [provided_production_variant] # Currently we just inject CoreDumpConfig.KmsKeyId from the config for production variant. @@ -4586,6 +4617,14 @@ def create_endpoint_config( ) request["DataCaptureConfig"] = inferred_data_capture_config_dict + if async_inference_config_dict is not None: + inferred_async_inference_config_dict = update_nested_dictionary_with_values_from_config( + async_inference_config_dict, + ENDPOINT_CONFIG_ASYNC_INFERENCE_PATH, + sagemaker_session=self, + ) + request["AsyncInferenceConfig"] = inferred_async_inference_config_dict + if explainer_config_dict is not None: request["ExplainerConfig"] = explainer_config_dict diff --git a/src/sagemaker/tensorflow/model.py b/src/sagemaker/tensorflow/model.py index c7f624114f..b384cbbbb5 100644 --- a/src/sagemaker/tensorflow/model.py +++ b/src/sagemaker/tensorflow/model.py @@ -358,6 +358,7 @@ def deploy( container_startup_health_check_timeout=None, inference_recommendation_id=None, explainer_config=None, + update_endpoint: Optional[bool] = False, **kwargs, ): """Deploy a Tensorflow ``Model`` to a SageMaker ``Endpoint``.""" @@ -383,6 +384,7 @@ def deploy( container_startup_health_check_timeout=container_startup_health_check_timeout, inference_recommendation_id=inference_recommendation_id, explainer_config=explainer_config, + update_endpoint=update_endpoint, **kwargs, ) diff --git
a/tests/unit/sagemaker/jumpstart/model/test_model.py b/tests/unit/sagemaker/jumpstart/model/test_model.py index be961828f4..d9b126f651 100644 --- a/tests/unit/sagemaker/jumpstart/model/test_model.py +++ b/tests/unit/sagemaker/jumpstart/model/test_model.py @@ -794,7 +794,7 @@ def test_jumpstart_model_kwargs_match_parent_class(self): and reach out to JumpStart team.""" init_args_to_skip: Set[str] = set(["model_reference_arn"]) - deploy_args_to_skip: Set[str] = set(["kwargs", "model_reference_arn"]) + deploy_args_to_skip: Set[str] = set(["kwargs", "model_reference_arn", "update_endpoint"]) deploy_args_removed_at_deploy_time: Set[str] = set(["model_access_configs"]) parent_class_init = Model.__init__ diff --git a/tests/unit/sagemaker/model/test_deploy.py b/tests/unit/sagemaker/model/test_deploy.py index 7b99281b96..4167ca62c3 100644 --- a/tests/unit/sagemaker/model/test_deploy.py +++ b/tests/unit/sagemaker/model/test_deploy.py @@ -23,6 +23,7 @@ from sagemaker.serverless import ServerlessInferenceConfig from sagemaker.explainer import ExplainerConfig from sagemaker.compute_resource_requirements.resource_requirements import ResourceRequirements +from sagemaker.enums import EndpointType from tests.unit.sagemaker.inference_recommender.constants import ( DESCRIBE_COMPILATION_JOB_RESPONSE, DESCRIBE_MODEL_PACKAGE_RESPONSE, @@ -1051,3 +1052,143 @@ def test_deploy_with_name_and_resources(sagemaker_session): async_inference_config_dict=None, live_logging=False, ) + + +@patch("sagemaker.model.Model._create_sagemaker_model", Mock()) +@patch("sagemaker.utils.name_from_base", return_value=ENDPOINT_NAME) +@patch("sagemaker.production_variant", return_value=BASE_PRODUCTION_VARIANT) +def test_deploy_with_update_endpoint(production_variant, name_from_base, sagemaker_session): + model = Model( + MODEL_IMAGE, MODEL_DATA, role=ROLE, name=MODEL_NAME, sagemaker_session=sagemaker_session + ) + + # Mock the create_endpoint_config to return a specific config name + endpoint_config_name =
"test-config-name" + sagemaker_session.create_endpoint_config.return_value = endpoint_config_name + + # Test update_endpoint=True scenario + endpoint_name = "existing-endpoint" + model.deploy( + instance_type=INSTANCE_TYPE, + initial_instance_count=INSTANCE_COUNT, + endpoint_name=endpoint_name, + update_endpoint=True, + ) + + # Verify create_endpoint_config is called with correct parameters + sagemaker_session.create_endpoint_config.assert_called_with( + name=MODEL_NAME, + model_name=MODEL_NAME, + initial_instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + accelerator_type=None, + tags=None, + kms_key=None, + data_capture_config_dict=None, + volume_size=None, + model_data_download_timeout=None, + container_startup_health_check_timeout=None, + explainer_config_dict=None, + async_inference_config_dict=None, + serverless_inference_config_dict=None, + routing_config=None, + inference_ami_version=None, + ) + + # Verify update_endpoint is called with correct parameters + sagemaker_session.update_endpoint.assert_called_with(endpoint_name, endpoint_config_name) + + # Test update_endpoint with serverless config + serverless_inference_config = ServerlessInferenceConfig() + serverless_inference_config_dict = { + "MemorySizeInMB": 2048, + "MaxConcurrency": 5, + } + model.deploy( + endpoint_name=endpoint_name, + update_endpoint=True, + serverless_inference_config=serverless_inference_config, + ) + + sagemaker_session.create_endpoint_config.assert_called_with( + name=MODEL_NAME, + model_name=MODEL_NAME, + initial_instance_count=None, + instance_type=None, + accelerator_type=None, + tags=None, + kms_key=None, + data_capture_config_dict=None, + volume_size=None, + model_data_download_timeout=None, + container_startup_health_check_timeout=None, + explainer_config_dict=None, + async_inference_config_dict=None, + serverless_inference_config_dict=serverless_inference_config_dict, + routing_config=None, + inference_ami_version=None, + ) + + # Verify update_endpoint is called with the
new config + sagemaker_session.update_endpoint.assert_called_with(endpoint_name, endpoint_config_name) + + # Test update_endpoint with async inference config + async_inference_config = AsyncInferenceConfig( + output_path="s3://bucket/output", failure_path="s3://bucket/failure" + ) + async_inference_config_dict = { + "OutputConfig": { + "S3OutputPath": "s3://bucket/output", + "S3FailurePath": "s3://bucket/failure", + }, + } + model.deploy( + endpoint_name=endpoint_name, + instance_type=INSTANCE_TYPE, + initial_instance_count=INSTANCE_COUNT, + update_endpoint=True, + async_inference_config=async_inference_config, + ) + + sagemaker_session.create_endpoint_config.assert_called_with( + name=MODEL_NAME, + model_name=MODEL_NAME, + initial_instance_count=INSTANCE_COUNT, + instance_type=INSTANCE_TYPE, + accelerator_type=None, + tags=None, + kms_key=None, + data_capture_config_dict=None, + volume_size=None, + model_data_download_timeout=None, + container_startup_health_check_timeout=None, + explainer_config_dict=None, + async_inference_config_dict=async_inference_config_dict, + serverless_inference_config_dict=None, + routing_config=None, + inference_ami_version=None, + ) + + # Verify update_endpoint is called with the new config + sagemaker_session.update_endpoint.assert_called_with(endpoint_name, endpoint_config_name) + + +@patch("sagemaker.model.Model._create_sagemaker_model", Mock()) +@patch("sagemaker.production_variant", return_value=BASE_PRODUCTION_VARIANT) +def test_deploy_with_update_endpoint_inference_component(production_variant, sagemaker_session): + model = Model( + MODEL_IMAGE, MODEL_DATA, role=ROLE, name=MODEL_NAME, sagemaker_session=sagemaker_session + ) + + # Test that updating endpoint with inference component raises error + with pytest.raises( + ValueError, match="Currently update_endpoint is supported for single model endpoints" + ): + model.deploy( + endpoint_name="test-endpoint", + instance_type=INSTANCE_TYPE, + initial_instance_count=INSTANCE_COUNT, +
update_endpoint=True, + resources=RESOURCES, + endpoint_type=EndpointType.INFERENCE_COMPONENT_BASED, + ) diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index 107d65c301..6661c6e2bf 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -4041,14 +4041,30 @@ def test_neuron_configurations_rule_set(self): @pytest.mark.parametrize( "test_case", [ + # Real-time deployment without update { "input_args": {"endpoint_name": "test"}, "call_params": { "instance_type": "ml.g5.2xlarge", "initial_instance_count": 1, "endpoint_name": "test", + "update_endpoint": False, }, }, + # Real-time deployment with update + { + "input_args": { + "endpoint_name": "existing-endpoint", + "update_endpoint": True, + }, + "call_params": { + "instance_type": "ml.g5.2xlarge", + "initial_instance_count": 1, + "endpoint_name": "existing-endpoint", + "update_endpoint": True, + }, + }, + # Serverless deployment without update { "input_args": { "endpoint_name": "test", @@ -4057,8 +4073,23 @@ def test_neuron_configurations_rule_set(self): "call_params": { "serverless_inference_config": ServerlessInferenceConfig(), "endpoint_name": "test", + "update_endpoint": False, }, }, + # Serverless deployment with update + { + "input_args": { + "endpoint_name": "existing-endpoint", + "inference_config": ServerlessInferenceConfig(), + "update_endpoint": True, + }, + "call_params": { + "serverless_inference_config": ServerlessInferenceConfig(), + "endpoint_name": "existing-endpoint", + "update_endpoint": True, + }, + }, + # Async deployment without update { "input_args": { "endpoint_name": "test", @@ -4069,10 +4100,30 @@ def test_neuron_configurations_rule_set(self): "instance_type": "ml.g5.2xlarge", "initial_instance_count": 1, "endpoint_name": "test", + "update_endpoint": False, }, }, + # Async deployment with update { - "input_args": {"endpoint_name": 
"test", "inference_config": RESOURCE_REQUIREMENTS}, + "input_args": { + "endpoint_name": "existing-endpoint", + "inference_config": AsyncInferenceConfig(output_path="op-path"), + "update_endpoint": True, + }, + "call_params": { + "async_inference_config": AsyncInferenceConfig(output_path="op-path"), + "instance_type": "ml.g5.2xlarge", + "initial_instance_count": 1, + "endpoint_name": "existing-endpoint", + "update_endpoint": True, + }, + }, + # Multi-Model deployment (update_endpoint not supported) + { + "input_args": { + "endpoint_name": "test", + "inference_config": RESOURCE_REQUIREMENTS, + }, "call_params": { "resources": RESOURCE_REQUIREMENTS, "role": "role-arn", @@ -4080,8 +4131,10 @@ def test_neuron_configurations_rule_set(self): "instance_type": "ml.g5.2xlarge", "mode": Mode.SAGEMAKER_ENDPOINT, "endpoint_type": EndpointType.INFERENCE_COMPONENT_BASED, + "update_endpoint": False, }, }, + # Batch transform { "input_args": { "inference_config": BatchTransformInferenceConfig( @@ -4096,7 +4149,16 @@ def test_neuron_configurations_rule_set(self): "id": "Batch", }, ], - ids=["Real Time", "Serverless", "Async", "Multi-Model", "Batch"], + ids=[ + "Real Time", + "Real Time Update", + "Serverless", + "Serverless Update", + "Async", + "Async Update", + "Multi-Model", + "Batch", + ], ) @patch("sagemaker.serve.builder.model_builder.unique_name_from_base") def test_deploy(mock_unique_name_from_base, test_case): @@ -4119,3 +4181,20 @@ def test_deploy(mock_unique_name_from_base, test_case): diff = deepdiff.DeepDiff(kwargs, test_case["call_params"]) assert diff == {} + + +def test_deploy_multi_model_update_error(): + model_builder = ModelBuilder( + model="meta-llama/Meta-Llama-3-8B-Instruct", + env_vars={"HUGGING_FACE_HUB_TOKEN": "token"}, + role_arn="role-arn", + instance_type="ml.g5.2xlarge", + ) + setattr(model_builder, "built_model", MagicMock()) + + with pytest.raises( + ValueError, match="Currently update_endpoint is supported for single model endpoints" + ): + 
model_builder.deploy( + endpoint_name="test", inference_config=RESOURCE_REQUIREMENTS, update_endpoint=True + )