diff --git a/src/sagemaker/serve/builder/jumpstart_builder.py b/src/sagemaker/serve/builder/jumpstart_builder.py
index 07885792d2..eb57dec1fa 100644
--- a/src/sagemaker/serve/builder/jumpstart_builder.py
+++ b/src/sagemaker/serve/builder/jumpstart_builder.py
@@ -669,7 +669,6 @@ def _optimize_for_jumpstart(
         self,
         output_path: Optional[str] = None,
         instance_type: Optional[str] = None,
-        role_arn: Optional[str] = None,
         tags: Optional[Tags] = None,
         job_name: Optional[str] = None,
         accept_eula: Optional[bool] = None,
@@ -685,9 +684,7 @@ def _optimize_for_jumpstart(
         Args:
             output_path (Optional[str]): Specifies where to store the compiled/quantized model.
-            instance_type (Optional[str]): Target deployment instance type that
-                the model is optimized for.
-            role_arn (Optional[str]): Execution role. Defaults to ``None``.
+            instance_type (str): Target deployment instance type that the model is optimized for.
             tags (Optional[Tags]): Tags for labeling a model optimization job. Defaults to ``None``.
             job_name (Optional[str]): The name of the model optimization job. Defaults to ``None``.
             accept_eula (bool): For models that require a Model Access Config, specify True or
@@ -715,7 +712,7 @@ def _optimize_for_jumpstart(
                 f"Model '{self.model}' requires accepting end-user license agreement (EULA)."
             )

-        is_compilation = (quantization_config is None) and (
+        is_compilation = (not quantization_config) and (
             (compilation_config is not None) or _is_inferentia_or_trainium(instance_type)
         )

@@ -758,7 +755,6 @@ def _optimize_for_jumpstart(
             else None
         )
         self.instance_type = instance_type or deployment_config_instance_type or _get_nb_instance()
-        self.role_arn = role_arn or self.role_arn

         create_optimization_job_args = {
             "OptimizationJobName": job_name,
@@ -787,10 +783,10 @@ def _optimize_for_jumpstart(
                     "AcceptEula": True
                 }

+        optimization_env_vars = _update_environment_variables(optimization_env_vars, override_env)
+        if optimization_env_vars:
+            self.pysdk_model.env.update(optimization_env_vars)
         if quantization_config or is_compilation:
-            self.pysdk_model.env = _update_environment_variables(
-                optimization_env_vars, override_env
-            )
             return create_optimization_job_args
         return None
diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py
index 01b2b96f68..fb6f60b9d0 100644
--- a/src/sagemaker/serve/builder/model_builder.py
+++ b/src/sagemaker/serve/builder/model_builder.py
@@ -73,7 +73,6 @@
     _generate_model_source,
     _extract_optimization_config_and_env,
     _is_s3_uri,
-    _normalize_local_model_path,
     _custom_speculative_decoding,
     _extract_speculative_draft_model_provider,
 )
@@ -833,6 +832,8 @@ def build(  # pylint: disable=R0911
             # until we deprecate HUGGING_FACE_HUB_TOKEN.
             if self.env_vars.get("HUGGING_FACE_HUB_TOKEN") and not self.env_vars.get("HF_TOKEN"):
                 self.env_vars["HF_TOKEN"] = self.env_vars.get("HUGGING_FACE_HUB_TOKEN")
+            elif self.env_vars.get("HF_TOKEN") and not self.env_vars.get("HUGGING_FACE_HUB_TOKEN"):
+                self.env_vars["HUGGING_FACE_HUB_TOKEN"] = self.env_vars.get("HF_TOKEN")

             self.sagemaker_session.settings._local_download_dir = self.model_path
@@ -851,7 +852,9 @@ def build(  # pylint: disable=R0911

         self._build_validations()

-        if not self._is_jumpstart_model_id() and self.model_server:
+        if (
+            not (isinstance(self.model, str) and self._is_jumpstart_model_id())
+        ) and self.model_server:
             return self._build_for_model_server()

         if isinstance(self.model, str):
@@ -1216,18 +1219,15 @@ def _model_builder_optimize_wrapper(
             raise ValueError("Quantization config and compilation config are mutually exclusive.")

         self.sagemaker_session = sagemaker_session or self.sagemaker_session or Session()
-        self.instance_type = instance_type or self.instance_type
         self.role_arn = role_arn or self.role_arn

-        self.build(mode=self.mode, sagemaker_session=self.sagemaker_session)
         job_name = job_name or f"modelbuilderjob-{uuid.uuid4().hex}"
-
         if self._is_jumpstart_model_id():
+            self.build(mode=self.mode, sagemaker_session=self.sagemaker_session)
             input_args = self._optimize_for_jumpstart(
                 output_path=output_path,
                 instance_type=instance_type,
-                role_arn=self.role_arn,
                 tags=tags,
                 job_name=job_name,
                 accept_eula=accept_eula,
@@ -1240,10 +1240,13 @@ def _model_builder_optimize_wrapper(
                 max_runtime_in_sec=max_runtime_in_sec,
             )
         else:
+            if self.model_server != ModelServer.DJL_SERVING:
+                logger.info("Overriding model server to DJL_SERVING.")
+                self.model_server = ModelServer.DJL_SERVING
+
+            self.build(mode=self.mode, sagemaker_session=self.sagemaker_session)
             input_args = self._optimize_for_hf(
                 output_path=output_path,
-                instance_type=instance_type,
-                role_arn=self.role_arn,
                 tags=tags,
                 job_name=job_name,
                 quantization_config=quantization_config,
@@ -1269,8 +1272,6 @@ def _model_builder_optimize_wrapper(
     def _optimize_for_hf(
         self,
         output_path: str,
-        instance_type: Optional[str] = None,
-        role_arn: Optional[str] = None,
         tags: Optional[Tags] = None,
         job_name: Optional[str] = None,
         quantization_config: Optional[Dict] = None,
@@ -1285,9 +1286,6 @@ def _optimize_for_hf(
         Args:
             output_path (str): Specifies where to store the compiled/quantized model.
-            instance_type (Optional[str]): Target deployment instance type that
-                the model is optimized for.
-            role_arn (Optional[str]): Execution role. Defaults to ``None``.
             tags (Optional[Tags]): Tags for labeling a model optimization job. Defaults to ``None``.
             job_name (Optional[str]): The name of the model optimization job. Defaults to ``None``.
             quantization_config (Optional[Dict]): Quantization configuration. Defaults to ``None``.
@@ -1305,13 +1303,6 @@ def _optimize_for_hf(
         Returns:
             Optional[Dict[str, Any]]: Model optimization job input arguments.
""" - if self.model_server != ModelServer.DJL_SERVING: - logger.info("Overwriting model server to DJL.") - self.model_server = ModelServer.DJL_SERVING - - self.role_arn = role_arn or self.role_arn - self.instance_type = instance_type or self.instance_type - self.pysdk_model = _custom_speculative_decoding( self.pysdk_model, speculative_decoding_config, False ) @@ -1371,13 +1362,12 @@ def _optimize_prepare_for_hf(self): ) else: if not custom_model_path: - custom_model_path = f"/tmp/sagemaker/model-builder/{self.model}/code" + custom_model_path = f"/tmp/sagemaker/model-builder/{self.model}" download_huggingface_model_metadata( self.model, - custom_model_path, + os.path.join(custom_model_path, "code"), self.env_vars.get("HUGGING_FACE_HUB_TOKEN"), ) - custom_model_path = _normalize_local_model_path(custom_model_path) self.pysdk_model.model_data, env = self._prepare_for_mode( model_path=custom_model_path, diff --git a/src/sagemaker/serve/utils/optimize_utils.py b/src/sagemaker/serve/utils/optimize_utils.py index 35a937407e..5781c0bade 100644 --- a/src/sagemaker/serve/utils/optimize_utils.py +++ b/src/sagemaker/serve/utils/optimize_utils.py @@ -282,26 +282,6 @@ def _extract_optimization_config_and_env( return None, None -def _normalize_local_model_path(local_model_path: Optional[str]) -> Optional[str]: - """Normalizes the local model path. - - Args: - local_model_path (Optional[str]): The local model path. - - Returns: - Optional[str]: The normalized model path. - """ - if local_model_path is None: - return local_model_path - - # Removes /code or /code/ path at the end of local_model_path, - # as it is appended during artifacts upload. - pattern = r"/code/?$" - if re.search(pattern, local_model_path): - return re.sub(pattern, "", local_model_path) - return local_model_path - - def _custom_speculative_decoding( model: Model, speculative_decoding_config: Optional[Dict], diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index 81d57243ea..4818b9d8b6 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -13,12 +13,11 @@ from __future__ import absolute_import from unittest.mock import MagicMock, patch, Mock, mock_open -import pytest - import unittest from pathlib import Path from copy import deepcopy +from sagemaker.serve import SchemaBuilder from sagemaker.serve.builder.model_builder import ModelBuilder from sagemaker.serve.mode.function_pointers import Mode from sagemaker.serve.model_format.mlflow.constants import MLFLOW_TRACKING_ARN @@ -2328,22 +2327,52 @@ def test_build_tensorflow_serving_non_mlflow_case( mock_session, ) - @pytest.mark.skip(reason="Implementation not completed") + @patch.object(ModelBuilder, "_prepare_for_mode") + @patch.object(ModelBuilder, "_build_for_djl") + @patch.object(ModelBuilder, "_is_jumpstart_model_id", return_value=False) @patch.object(ModelBuilder, "_get_serve_setting", autospec=True) @patch("sagemaker.serve.utils.telemetry_logger._send_telemetry") - def test_optimize(self, mock_send_telemetry, mock_get_serve_setting): + def test_optimize( + self, + mock_send_telemetry, + mock_get_serve_setting, + mock_is_jumpstart_model_id, + mock_build_for_djl, + mock_prepare_for_mode, + ): mock_sagemaker_session = Mock() mock_settings = Mock() mock_settings.telemetry_opt_out = False mock_get_serve_setting.return_value = mock_settings + pysdk_model = Mock() + pysdk_model.env = {"key": "val"} + pysdk_model.add_tags.side_effect 
= lambda *arg, **kwargs: None + + mock_build_for_djl.side_effect = lambda **kwargs: pysdk_model + mock_prepare_for_mode.side_effect = lambda *args, **kwargs: ( + { + "S3DataSource": { + "S3Uri": "s3://uri", + "S3DataType": "S3Prefix", + "CompressionType": "None", + } + }, + {"key": "val"}, + ) + builder = ModelBuilder( - model_path=MODEL_PATH, - schema_builder=schema_builder, - model=mock_fw_model, + schema_builder=SchemaBuilder( + sample_input={"inputs": "Hello", "parameters": {}}, + sample_output=[{"generated_text": "Hello"}], + ), + model="meta-llama/Meta-Llama-3-8B", sagemaker_session=mock_sagemaker_session, + env_vars={"HF_TOKEN": "token"}, + model_metadata={"CUSTOM_MODEL_PATH": "/tmp/modelbuilders/code"}, ) + builder.pysdk_model = pysdk_model job_name = "my-optimization-job" instance_type = "ml.inf1.xlarge" @@ -2352,10 +2381,6 @@ def test_optimize(self, mock_send_telemetry, mock_get_serve_setting): "Image": "quantization-image-uri", "OverrideEnvironment": {"ENV_VAR": "value"}, } - compilation_config = { - "Image": "compilation-image-uri", - "OverrideEnvironment": {"ENV_VAR": "value"}, - } env_vars = {"Var1": "value", "Var2": "value"} kms_key = "arn:aws:kms:us-west-2:123456789012:key/my-key-id" max_runtime_in_sec = 3600 @@ -2368,36 +2393,17 @@ def test_optimize(self, mock_send_telemetry, mock_get_serve_setting): "Subnets": ["subnet-01234567", "subnet-89abcdef"], } - expected_create_optimization_job_args = { - "ModelSource": {"S3": {"S3Uri": MODEL_PATH, "ModelAccessConfig": {"AcceptEula": True}}}, - "DeploymentInstanceType": instance_type, - "OptimizationEnvironment": env_vars, - "OptimizationConfigs": [ - {"ModelQuantizationConfig": quantization_config}, - {"ModelCompilationConfig": compilation_config}, - ], - "OutputConfig": {"S3OutputLocation": output_path, "KmsKeyId": kms_key}, - "RoleArn": mock_role_arn, - "OptimizationJobName": job_name, - "StoppingCondition": {"MaxRuntimeInSeconds": max_runtime_in_sec}, - "Tags": [ - {"Key": "Project", "Value": "my-project"}, - {"Key": "Environment", "Value": "production"}, - ], - "VpcConfig": vpc_config, - } - - mock_sagemaker_session.sagemaker_client.create_optimization_job.return_value = { - "OptimizationJobArn": "arn:aws:sagemaker:us-west-2:123456789012:optimization-job/my-optimization-job" + mock_sagemaker_session.wait_for_optimization_job.side_effect = lambda *args, **kwargs: { + "OptimizationJobArn": "arn:aws:sagemaker:us-west-2:123456789012:optimization-job/my-optimization-job", + "OptimizationJobName": "my-optimization-job", } builder.optimize( instance_type=instance_type, output_path=output_path, - role=mock_role_arn, + role_arn=mock_role_arn, job_name=job_name, quantization_config=quantization_config, - compilation_config=compilation_config, env_vars=env_vars, kms_key=kms_key, max_runtime_in_sec=max_runtime_in_sec, @@ -2405,9 +2411,37 @@ def test_optimize(self, mock_send_telemetry, mock_get_serve_setting): vpc_config=vpc_config, ) + self.assertEqual(builder.env_vars["HUGGING_FACE_HUB_TOKEN"], "token") + self.assertEqual(builder.model_server, ModelServer.DJL_SERVING) + mock_send_telemetry.assert_called_once() mock_sagemaker_session.sagemaker_client.create_optimization_job.assert_called_once_with( - **expected_create_optimization_job_args + OptimizationJobName="my-optimization-job", + DeploymentInstanceType="ml.inf1.xlarge", + RoleArn="arn:aws:iam::123456789012:role/SageMakerRole", + OptimizationEnvironment={"Var1": "value", "Var2": "value"}, + ModelSource={"S3": {"S3Uri": "s3://uri"}}, + OptimizationConfigs=[ + { + 
"ModelQuantizationConfig": { + "Image": "quantization-image-uri", + "OverrideEnvironment": {"ENV_VAR": "value"}, + } + } + ], + OutputConfig={ + "S3OutputLocation": "s3://my-bucket/output", + "KmsKeyId": "arn:aws:kms:us-west-2:123456789012:key/my-key-id", + }, + StoppingCondition={"MaxRuntimeInSeconds": 3600}, + Tags=[ + {"Key": "Project", "Value": "my-project"}, + {"Key": "Environment", "Value": "production"}, + ], + VpcConfig={ + "SecurityGroupIds": ["sg-01234567890abcdef", "sg-fedcba9876543210"], + "Subnets": ["subnet-01234567", "subnet-89abcdef"], + }, ) def test_handle_mlflow_input_without_mlflow_model_path(self): @@ -2649,26 +2683,25 @@ def test_optimize_for_hf_with_custom_s3_path( model_builder = ModelBuilder( model="meta-llama/Meta-Llama-3-8B-Instruct", - env_vars={"HUGGING_FACE_HUB_TOKEN": "token"}, + env_vars={"HF_TOKEN": "token"}, model_metadata={ "CUSTOM_MODEL_PATH": "s3://bucket/path/", }, + role_arn="role-arn", + instance_type="ml.g5.2xlarge", ) model_builder.pysdk_model = mock_pysdk_model out_put = model_builder._optimize_for_hf( job_name="job_name-123", - instance_type="ml.g5.2xlarge", - role_arn="role-arn", quantization_config={ "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, }, output_path="s3://bucket/code/", ) - print(out_put) - + self.assertEqual(model_builder.env_vars["HF_TOKEN"], "token") self.assertEqual(model_builder.role_arn, "role-arn") self.assertEqual(model_builder.instance_type, "ml.g5.2xlarge") self.assertEqual(model_builder.pysdk_model.env["OPTION_QUANTIZE"], "awq") @@ -2715,14 +2748,14 @@ def test_optimize_for_hf_without_custom_s3_path( model_builder = ModelBuilder( model="meta-llama/Meta-Llama-3-8B-Instruct", env_vars={"HUGGING_FACE_HUB_TOKEN": "token"}, + role_arn="role-arn", + instance_type="ml.g5.2xlarge", ) model_builder.pysdk_model = mock_pysdk_model out_put = model_builder._optimize_for_hf( job_name="job_name-123", - instance_type="ml.g5.2xlarge", - role_arn="role-arn", quantization_config={ "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}, }, diff --git a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py index 712382f068..a8dc6d74f4 100644 --- a/tests/unit/sagemaker/serve/utils/test_optimize_utils.py +++ b/tests/unit/sagemaker/serve/utils/test_optimize_utils.py @@ -28,7 +28,6 @@ _generate_additional_model_data_sources, _generate_channel_name, _extract_optimization_config_and_env, - _normalize_local_model_path, _is_optimized, _custom_speculative_decoding, _is_inferentia_or_trainium, @@ -312,19 +311,6 @@ def test_extract_optimization_config_and_env( ) -@pytest.mark.parametrize( - "my_path, expected_path", - [ - ("local/path/llama/code", "local/path/llama"), - ("local/path/llama/code/", "local/path/llama"), - ("local/path/llama/", "local/path/llama/"), - ("local/path/llama", "local/path/llama"), - ], -) -def test_normalize_local_model_path(my_path, expected_path): - assert _normalize_local_model_path(my_path) == expected_path - - class TestCustomSpeculativeDecodingConfig(unittest.TestCase): @patch("sagemaker.model.Model")