Emit a warning when CPU cores are requested with a sharded model deployment.

Joseph Zhang · Joseph Zhang · commit 1bddf727c5f3 · 2024-11-20T16:15:51.000-08:00
diff --git a/src/sagemaker/jumpstart/model.py b/src/sagemaker/jumpstart/model.py
@@ -817,6 +817,20 @@ def deploy(
                 f"{EndpointType.INFERENCE_COMPONENT_BASED} is not supported for Proprietary models."
             )
 
+        # No `resources` passed to deploy() but a populated `resources` entry in deploy_kwargs
+        # means the default JumpStart resource requirements are being used.
+        if hasattr(self, "_is_sharded_model") and not resources and deploy_kwargs.resources:
+            if (
+                self._is_sharded_model
+                and deploy_kwargs.resources.num_cpus
+                and deploy_kwargs.resources.num_cpus > 0
+            ):
+                JUMPSTART_LOGGER.warning(
+                    "NumberOfCpuCoresRequired should be 0 for the best experience with SageMaker "
+                    "Fast Model Loading. Overriding the requested `num_cpus` to 0."
+                )
+                deploy_kwargs.resources.num_cpus = 0
+
         self.additional_model_data_sources = _add_model_access_configs_to_model_data_sources(
             self.additional_model_data_sources,
             deploy_kwargs.model_access_configs,
diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py
@@ -1613,6 +1613,13 @@ def deploy(
                 "Loading of model requires network access."
             )
 
+        # `resources` may be omitted by the caller; check it before reading `num_cpus`.
+        if self._is_sharded_model and resources and (resources.num_cpus or 0) > 0:
+            logger.warning(
+                "NumberOfCpuCoresRequired should be 0 for the best experience with SageMaker "
+                "Fast Model Loading. Configure by setting `num_cpus` to 0 in `resources`."
+            )
+
         # Support multiple models on same endpoint
         if endpoint_type == EndpointType.INFERENCE_COMPONENT_BASED:
             if endpoint_name:
diff --git a/tests/unit/sagemaker/model/test_model.py b/tests/unit/sagemaker/model/test_model.py
@@ -1482,3 +1482,47 @@ def test_model_source(
     )
 
     assert model_1._get_model_uri() == "s3://tmybuckaet"
+
+
+@patch("sagemaker.utils.repack_model")
+@patch("sagemaker.fw_utils.tar_and_upload_dir")
+def test_deploy_sharded_model_with_cpus_requested_raises_warning(
+    tar_and_upload_dir, repack_model, sagemaker_session
+):
+    """Deploying a sharded model that requests CPU cores logs exactly one warning."""
+    framework_model_classes_to_kwargs = {
+        HuggingFaceModel: {
+            "pytorch_version": "1.7.1",
+            "py_version": "py36",
+            "transformers_version": "4.6.1",
+        },
+    }
+
+    sagemaker_session.settings = SessionSettings(include_jumpstart_tags=False)
+
+    source_dir = "s3://blah/blah/blah"
+    for framework_model_class, kwargs in framework_model_classes_to_kwargs.items():
+        test_sharded_model = framework_model_class(
+            entry_point=ENTRY_POINT_INFERENCE,
+            role=ROLE,
+            sagemaker_session=sagemaker_session,
+            model_data=source_dir,
+            **kwargs,
+        )
+        test_sharded_model._is_sharded_model = True
+
+        # Patch the module-level logger so the warning emitted by deploy() can be inspected.
+        with patch("sagemaker.model.logger") as mock_logger:
+            test_sharded_model.deploy(
+                instance_type="ml.m2.xlarge",
+                initial_instance_count=INSTANCE_COUNT,
+                endpoint_type=EndpointType.MODEL_BASED,
+                resources=ResourceRequirements(
+                    requests={"num_accelerators": 1, "memory": 8192, "copies": 1, "num_cpus": 1},
+                    limits={},
+                ),
+            )
+            mock_logger.warning.assert_called_once_with(
+                "NumberOfCpuCoresRequired should be 0 for the best experience with SageMaker "
+                "Fast Model Loading. Configure by setting `num_cpus` to 0 in `resources`."
+            )