|
62 | 62 | from sagemaker.serve.utils import task
|
63 | 63 | from sagemaker.serve.utils.exceptions import TaskNotFoundException
|
64 | 64 | from sagemaker.serve.utils.lineage_utils import _maintain_lineage_tracking_for_mlflow_model
|
| 65 | +from sagemaker.serve.utils.optimize_utils import ( |
| 66 | + _is_compatible_with_compilation, |
| 67 | + _poll_optimization_job, |
| 68 | +) |
65 | 69 | from sagemaker.serve.utils.predictors import _get_local_mode_predictor
|
66 | 70 | from sagemaker.serve.utils.hardware_detector import (
|
67 | 71 | _get_gpu_info,
|
|
83 | 87 | from sagemaker.serve.validations.check_image_and_hardware_type import (
|
84 | 88 | validate_image_uri_and_hardware,
|
85 | 89 | )
|
| 90 | +from sagemaker.utils import Tags |
86 | 91 | from sagemaker.workflow.entities import PipelineVariable
|
87 | 92 | from sagemaker.huggingface.llm_utils import get_huggingface_model_metadata
|
88 | 93 |
|
@@ -804,8 +809,15 @@ def save(
|
804 | 809 | This function is available for models served by DJL serving.
|
805 | 810 |
|
806 | 811 | Args:
|
807 | | - save_path (Optional[str]): The path where you want to save resources. |
808 | | - s3_path (Optional[str]): The path where you want to upload resources. |
| 812 | + save_path (Optional[str]): The path where you want to save resources. Defaults to |
| 813 | + ``None``. |
| 814 | + s3_path (Optional[str]): The path where you want to upload resources. Defaults to |
| 815 | + ``None``. |
| 816 | + sagemaker_session (Optional[Session]): Session object which manages interactions |
| 817 | + with Amazon SageMaker APIs and any other AWS services needed. If not specified, the |
| 818 | + function creates one using the default AWS configuration chain. Defaults to |
| 819 | + ``None``. |
| 820 | + role_arn (Optional[str]): The IAM role arn. Defaults to ``None``. |
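| | + |
| | + Example (illustrative sketch only; the local path and S3 URI are assumed values, not part of this change): |
| | + |
| | + >>> model_builder.save( |
| | + ... save_path="./my_model_resources", |
| | + ... s3_path="s3://my-bucket/my_model_resources", |
| | + ... ) |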
809 | 821 | """
|
810 | 822 | self.sagemaker_session = sagemaker_session or Session()
|
811 | 823 |
|
@@ -915,3 +927,129 @@ def _try_fetch_gpu_info(self):
|
915 | 927 | raise ValueError(
|
916 | 928 | f"Unable to determine single GPU size for instance: [{self.instance_type}]"
|
917 | 929 | )
|
| 930 | + |
| 931 | + def optimize(self, *args, **kwargs) -> Type[Model]: |
| 932 | + """Runs a model optimization job. |
| 933 | + |
| 934 | + Args: |
| 935 | + instance_type (str): Target deployment instance type that the model is optimized for. |
| 936 | + output_path (str): Specifies where to store the compiled/quantized model. |
| 937 | + role (Optional[str]): Execution role. Defaults to ``None``. |
| 938 | + tags (Optional[Tags]): Tags for labeling a model optimization job. Defaults to ``None``. |
| 939 | + job_name (Optional[str]): The name of the model optimization job. Defaults to ``None``. |
| 940 | + quantization_config (Optional[Dict]): Quantization configuration. Defaults to ``None``. |
| 941 | + compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``. |
| 942 | + env_vars (Optional[Dict]): Additional environment variables to run the optimization |
| 943 | + container. Defaults to ``None``. |
| 944 | + vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``. |
| 945 | + kms_key (Optional[str]): KMS key ARN used to encrypt the model artifacts when uploading |
| 946 | + to S3. Defaults to ``None``. |
| 947 | + max_runtime_in_sec (Optional[int]): Maximum job execution time in seconds. Defaults to |
| 948 | + ``None``. |
| 949 | + sagemaker_session (Optional[Session]): Session object which manages interactions |
| 950 | + with Amazon SageMaker APIs and any other AWS services needed. If not specified, the |
| 951 | + function creates one using the default AWS configuration chain. |
| 952 | + |
| 953 | + Returns: |
| 954 | + Type[Model]: A deployable ``Model`` object. |
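| | + |
| | + Example (illustrative sketch only; the instance type, S3 output path, and config values are assumed, not part of this change): |
| | + |
| | + >>> optimized_model = model_builder.optimize( |
| | + ... instance_type="ml.inf2.xlarge", |
| | + ... output_path="s3://my-bucket/optimized-model/", |
| | + ... compilation_config={"OverrideEnvironment": {"OPTION_DTYPE": "fp16"}}, |
| | + ... ) |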
| 955 | + """ |
| 956 | + # need to get telemetry_opt_out info before telemetry decorator is called |
| 957 | + self.serve_settings = self._get_serve_setting() |
| 958 | + |
| 959 | + return self._model_builder_optimize_wrapper(*args, **kwargs) |
| 960 | + |
| 961 | + @_capture_telemetry("optimize") |
| 962 | + def _model_builder_optimize_wrapper( |
| 963 | + self, |
| 964 | + instance_type: str, |
| 965 | + output_path: str, |
| 966 | + role: Optional[str] = None, |
| 967 | + tags: Optional[Tags] = None, |
| 968 | + job_name: Optional[str] = None, |
| 969 | + quantization_config: Optional[Dict] = None, |
| 970 | + compilation_config: Optional[Dict] = None, |
| 971 | + env_vars: Optional[Dict] = None, |
| 972 | + vpc_config: Optional[Dict] = None, |
| 973 | + kms_key: Optional[str] = None, |
| 974 | + max_runtime_in_sec: Optional[int] = None, |
| 975 | + sagemaker_session: Optional[Session] = None, |
| 976 | + ) -> Type[Model]: |
| 977 | + """Runs a model optimization job. |
| 978 | + |
| 979 | + Args: |
| 980 | + instance_type (str): Target deployment instance type that the model is optimized for. |
| 981 | + output_path (str): Specifies where to store the compiled/quantized model. |
| 982 | + role (Optional[str]): Execution role. Defaults to ``None``. |
| 983 | + tags (Optional[Tags]): Tags for labeling a model optimization job. Defaults to ``None``. |
| 984 | + job_name (Optional[str]): The name of the model optimization job. Defaults to ``None``. |
| 985 | + quantization_config (Optional[Dict]): Quantization configuration. Defaults to ``None``. |
| 986 | + compilation_config (Optional[Dict]): Compilation configuration. Defaults to ``None``. |
| 987 | + env_vars (Optional[Dict]): Additional environment variables to run the optimization |
| 988 | + container. Defaults to ``None``. |
| 989 | + vpc_config (Optional[Dict]): The VpcConfig set on the model. Defaults to ``None``. |
| 990 | + kms_key (Optional[str]): KMS key ARN used to encrypt the model artifacts when uploading |
| 991 | + to S3. Defaults to ``None``. |
| 992 | + max_runtime_in_sec (Optional[int]): Maximum job execution time in seconds. Defaults to |
| 993 | + ``None``. |
| 994 | + sagemaker_session (Optional[Session]): Session object which manages interactions |
| 995 | + with Amazon SageMaker APIs and any other AWS services needed. If not specified, the |
| 996 | + function creates one using the default AWS configuration chain. |
| 997 | +
|
| 998 | + Returns: |
| 999 | + Type[Model]: A deployable ``Model`` object. |
| 1000 | + """ |
| 1001 | + self.sagemaker_session = sagemaker_session or self.sagemaker_session or Session() |
| 1002 | + |
| 1003 | + # TODO: inject actual model source location based on different scenarios |
| 1004 | + model_source = {"S3": {"S3Uri": self.model_path, "ModelAccessConfig": {"AcceptEula": True}}} |
| 1005 | + |
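| | + # Gather the requested optimization steps; quantization and compilation each become an entry in OptimizationConfigs. |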
| 1006 | + optimization_configs = [] |
| 1007 | + if quantization_config: |
| 1008 | + optimization_configs.append({"ModelQuantizationConfig": quantization_config}) |
| 1009 | + if compilation_config: |
| 1010 | + if _is_compatible_with_compilation(instance_type): |
| 1011 | + optimization_configs.append({"ModelCompilationConfig": compilation_config}) |
| 1012 | + else: |
| 1013 | + logger.warning( |
| 1014 | + "Model compilation is currently only supported for Inferentia and Trainium" |
| 1015 | + "instances, ignoring `compilation_config'." |
| 1016 | + ) |
| 1017 | + |
| 1018 | + output_config = {"S3OutputLocation": output_path} |
| 1019 | + if kms_key: |
| 1020 | + output_config["KmsKeyId"] = kms_key |
| 1021 | + |
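| | + # Assemble the CreateOptimizationJob request; optional fields are added below only when provided. |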
| 1022 | + job_name = job_name or f"modelbuilderjob-{uuid.uuid4().hex}" |
| 1023 | + create_optimization_job_args = { |
| 1024 | + "OptimizationJobName": job_name, |
| 1025 | + "ModelSource": model_source, |
| 1026 | + "DeploymentInstanceType": instance_type, |
| 1027 | + "OptimizationConfigs": optimization_configs, |
| 1028 | + "OutputConfig": output_config, |
| 1029 | + "RoleArn": role or self.role_arn, |
| 1030 | + } |
| 1031 | + |
| 1032 | + if env_vars: |
| 1033 | + create_optimization_job_args["OptimizationEnvironment"] = env_vars |
| 1034 | + |
| 1035 | + if max_runtime_in_sec: |
| 1036 | + create_optimization_job_args["StoppingCondition"] = { |
| 1037 | + "MaxRuntimeInSeconds": max_runtime_in_sec |
| 1038 | + } |
| 1039 | + |
| 1040 | + # TODO: tag injection if it is a JumpStart model |
| 1041 | + if tags: |
| 1042 | + create_optimization_job_args["Tags"] = tags |
| 1043 | + |
| 1044 | + if vpc_config: |
| 1045 | + create_optimization_job_args["VpcConfig"] = vpc_config |
| 1046 | + |
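| | + # Submit the optimization job, then poll until it finishes or the wait times out. |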
| 1047 | + response = self.sagemaker_session.sagemaker_client.create_optimization_job( |
| 1048 | + **create_optimization_job_args |
| 1049 | + ) |
| 1050 | + |
| 1051 | + if not _poll_optimization_job(job_name, self.sagemaker_session): |
| 1052 | + raise Exception("Optimization job timed out.") |
| 1053 | + |
| 1054 | + # TODO: return model created by optimization job |
| 1055 | + return response |