
Commit 7e8e237

Joseph Zhang authored and gwang111 committed
Disable network isolation if using sharded models.
1 parent 3e97708 commit 7e8e237

File tree

src/sagemaker/model.py
src/sagemaker/serve/builder/jumpstart_builder.py
src/sagemaker/serve/builder/model_builder.py
src/sagemaker/serve/validations/optimization.py
tests/unit/sagemaker/serve/builder/test_model_builder.py

5 files changed (+29 -19 lines changed)

src/sagemaker/model.py

Lines changed: 6 additions & 0 deletions
@@ -1607,6 +1607,12 @@ def deploy(
                 )
                 endpoint_type = EndpointType.INFERENCE_COMPONENT_BASED
 
+        if self._is_sharded_model and self._enable_network_isolation:
+            raise ValueError(
+                "EnableNetworkIsolation cannot be set to True since SageMaker Fast Model "
+                "Loading of model requires network access."
+            )
+
         # Support multiple models on same endpoint
         if endpoint_type == EndpointType.INFERENCE_COMPONENT_BASED:
             if endpoint_name:
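
For context, a minimal sketch of how this guard surfaces to a caller. The image URI, model data path, and role below are hypothetical, and `_is_sharded_model` is an internal flag normally set by a prior sharding optimization; it is toggled by hand here only for illustration:

from sagemaker.model import Model

# Hypothetical URIs and role, for illustration only.
model = Model(
    image_uri="<inference-image-uri>",
    model_data="s3://my-bucket/sharded-model/",
    role="arn:aws:iam::111122223333:role/SageMakerRole",
    enable_network_isolation=True,
)
model._is_sharded_model = True  # normally set internally by optimize(sharding_config=...)

# deploy() now fails fast with the ValueError above instead of launching
# an endpoint whose weights cannot be fetched under network isolation.
model.deploy(initial_instance_count=1, instance_type="ml.g5.24xlarge")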

src/sagemaker/serve/builder/jumpstart_builder.py

Lines changed: 8 additions & 0 deletions
@@ -795,6 +795,14 @@ def _optimize_for_jumpstart(
         optimization_env_vars = _update_environment_variables(optimization_env_vars, override_env)
         if optimization_env_vars:
             self.pysdk_model.env.update(optimization_env_vars)
+
+        if sharding_config and self.pysdk_model._enable_network_isolation:
+            logger.warning(
+                "EnableNetworkIsolation cannot be set to True since SageMaker Fast Model "
+                "Loading of model requires network access. Setting it to False."
+            )
+            self.pysdk_model._enable_network_isolation = False
+
         if quantization_config or sharding_config or is_compilation:
             return create_optimization_job_args
         return None
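
On the ModelBuilder path the same conflict is only a warning: the builder repairs the configuration rather than rejecting it. A sketch, assuming a JumpStart model ID and output bucket that are illustrative rather than taken from this commit:

from sagemaker.serve.builder.model_builder import ModelBuilder

# Hypothetical JumpStart model ID and S3 output path.
model_builder = ModelBuilder(model="meta-textgeneration-llama-3-1-8b")
optimized_model = model_builder.optimize(
    instance_type="ml.g5.24xlarge",
    output_path="s3://my-bucket/optimized/",
    sharding_config={"OverrideEnvironment": {"OPTION_TENSOR_PARALLEL_DEGREE": "4"}},
)
# If the resolved JumpStart model had network isolation enabled, the builder
# logs the warning above and sets _enable_network_isolation to False, so the
# optimized model remains deployable.

The asymmetry between the two paths appears deliberate: `deploy()` raises because the caller set the flag explicitly, while the JumpStart builder inherits the flag from model metadata and can safely override it.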

src/sagemaker/serve/builder/model_builder.py

Lines changed: 2 additions & 2 deletions
@@ -1276,8 +1276,8 @@ def _model_builder_optimize_wrapper(
         ):
             raise ValueError(
                 (
-                    "OPTION_TENSOR_PARALLEL_DEGREE is required "
-                    "environment variable with Sharding config."
+                    "OPTION_TENSOR_PARALLEL_DEGREE is a required "
+                    "environment variable with sharding config."
                 )
             )
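
Alongside the wording fix, a sketch of the validation it refers to, assuming a configured `ModelBuilder` like the one above (instance type and degree are illustrative):

# Rejected: the sharding config never sets OPTION_TENSOR_PARALLEL_DEGREE.
# Raises: ValueError: OPTION_TENSOR_PARALLEL_DEGREE is a required
# environment variable with sharding config.
model_builder.optimize(
    instance_type="ml.g5.24xlarge",
    sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}},
)

# Accepted: the tensor-parallel degree is supplied explicitly.
model_builder.optimize(
    instance_type="ml.g5.24xlarge",
    sharding_config={"OverrideEnvironment": {"OPTION_TENSOR_PARALLEL_DEGREE": "4"}},
)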

src/sagemaker/serve/validations/optimization.py

Lines changed: 11 additions & 10 deletions
@@ -211,14 +211,15 @@ def _validate_optimization_configuration(
                 f"Optimizations that use {trt_compare_error} for GPU instances."
             )
         if str(trt_compare_error) == str(vllm_compare_error):
-            joint_error_msg = f"""
-            Optimization cannot be performed for the following reasons:
-            - Optimizations that use {trt_compare_error} are not supported for GPU instances.
-            """
-        else:
-            joint_error_msg = f"""
-            Optimization cannot be performed for the following reasons:
-            - Optimizations that use {trt_compare_error} are not supported for GPU instances.
-            - Optimizations that use {vllm_compare_error} are not supported for GPU instances.
-            """
+            raise ValueError(
+                (
+                    f"Optimizations that use {trt_compare_error} "
+                    "are not supported for GPU instances."
+                )
+            )
+        joint_error_msg = f"""
+        Optimization cannot be performed for the following reasons:
+        - Optimizations that use {trt_compare_error} are not supported for GPU instances.
+        - Optimizations that use {vllm_compare_error} are not supported for GPU instances.
+        """
         raise ValueError(textwrap.dedent(joint_error_msg))
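
The control flow is easier to follow out of context: when the TensorRT-LLM and vLLM rule sets report the same finding, a single de-duplicated message is raised early; otherwise the joint two-bullet message is built as before. A standalone sketch of that pattern (function and variable names are illustrative, not the SDK's internals):

import textwrap

def raise_joint_error(trt_compare_error, vllm_compare_error):
    # Identical findings from both rule sets: raise one message early.
    if str(trt_compare_error) == str(vllm_compare_error):
        raise ValueError(
            f"Optimizations that use {trt_compare_error} "
            "are not supported for GPU instances."
        )
    # Distinct findings: report both, one bullet per rule set.
    joint_error_msg = f"""
    Optimization cannot be performed for the following reasons:
    - Optimizations that use {trt_compare_error} are not supported for GPU instances.
    - Optimizations that use {vllm_compare_error} are not supported for GPU instances.
    """
    raise ValueError(textwrap.dedent(joint_error_msg))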

tests/unit/sagemaker/serve/builder/test_model_builder.py

Lines changed: 2 additions & 7 deletions
@@ -12,7 +12,6 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import textwrap
 from unittest.mock import MagicMock, patch, Mock, mock_open
 
 import unittest
@@ -2701,7 +2700,7 @@ def test_optimize_exclusive_sharding_args(self, mock_get_serve_setting):
 
         self.assertRaisesRegex(
             ValueError,
-            "OPTION_TENSOR_PARALLEL_DEGREE is required environment variable with Sharding config.",
+            "OPTION_TENSOR_PARALLEL_DEGREE is a required environment variable with sharding config.",
             lambda: model_builder.optimize(
                 instance_type="ml.g5.24xlarge",
                 sharding_config={"OverrideEnvironment": {"OPTION_QUANTIZE": "awq"}},
@@ -2876,13 +2875,9 @@ def test_trt_and_vllm_configurations_throw_errors_for_rule_set(self):
             )
 
         # Invalid quantization technique
-        expected_quantization_error_message = """
-        Optimization cannot be performed for the following reasons:
-        - Optimizations that use Quantization:test are not supported for GPU instances.
-        """
         self.assertRaisesRegex(
             ValueError,
-            textwrap.dedent(expected_quantization_error_message),
+            "Optimizations that use Quantization:test are not supported for GPU instances.",
             lambda: _validate_optimization_configuration(
                 instance_type="ml.g5.24xlarge",
                 quantization_config={
