From c0902e86fe61e7187cdd2c647715df5fafc5bfa5 Mon Sep 17 00:00:00 2001 From: Loki Date: Wed, 18 Dec 2024 23:49:28 +0000 Subject: [PATCH 1/2] Updating Inference Optimization Validations --- src/sagemaker/serve/builder/model_builder.py | 9 ++++----- tests/unit/sagemaker/serve/builder/test_model_builder.py | 6 +++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py index e5e850b885..63d5190839 100644 --- a/src/sagemaker/serve/builder/model_builder.py +++ b/src/sagemaker/serve/builder/model_builder.py @@ -1433,15 +1433,14 @@ def _model_builder_optimize_wrapper( # HF Model ID format = "meta-llama/Meta-Llama-3.1-8B" # JS Model ID format = "meta-textgeneration-llama-3-1-8b" - llama_3_1_keywords = ["llama-3.1", "llama-3-1"] - is_llama_3_1 = self.model and any( - keyword in self.model.lower() for keyword in llama_3_1_keywords + is_llama_3_plus = self.model and bool( + re.search(r"llama-3[\.\-][1-9]\d*", self.model.lower()) ) if is_gpu_instance and self.model and self.is_compiled: - if is_llama_3_1: + if is_llama_3_plus: raise ValueError( - "Compilation is not supported for Llama-3.1 with a GPU instance." + "Compilation is not supported for models greater than Llama-3.0 with a GPU instance." ) if speculative_decoding_config: raise ValueError( diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py index 7355fe4f38..1e20bf1cf3 100644 --- a/tests/unit/sagemaker/serve/builder/test_model_builder.py +++ b/tests/unit/sagemaker/serve/builder/test_model_builder.py @@ -3270,7 +3270,7 @@ def test_optimize_with_gpu_instance_and_llama_3_1_and_compilation( mock_pysdk_model = Mock() mock_pysdk_model.model_data = None - mock_pysdk_model.env = {"HF_MODEL_ID": "meta-llama/Meta-Llama-3-1-8B-Instruct"} + mock_pysdk_model.env = {"HF_MODEL_ID": "meta-llama/Meta-Llama-3-2-8B-Instruct"} sample_input = {"inputs": "dummy prompt", "parameters": {}} @@ -3279,7 +3279,7 @@ def test_optimize_with_gpu_instance_and_llama_3_1_and_compilation( dummy_schema_builder = SchemaBuilder(sample_input, sample_output) model_builder = ModelBuilder( - model="meta-llama/Meta-Llama-3-1-8B-Instruct", + model="meta-llama/Meta-Llama-3-2-8B-Instruct", schema_builder=dummy_schema_builder, env_vars={"HF_TOKEN": "token"}, model_metadata={ @@ -3293,7 +3293,7 @@ def test_optimize_with_gpu_instance_and_llama_3_1_and_compilation( self.assertRaisesRegex( ValueError, - "Compilation is not supported for Llama-3.1 with a GPU instance.", + "Compilation is not supported for models greater than Llama-3.0 with a GPU instance.", lambda: model_builder.optimize( job_name="job_name-123", instance_type="ml.g5.24xlarge", From 48ca11d308012769292a3a30426983aff794738f Mon Sep 17 00:00:00 2001 From: Loki Date: Wed, 18 Dec 2024 23:56:07 +0000 Subject: [PATCH 2/2] Linting --- src/sagemaker/serve/builder/model_builder.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sagemaker/serve/builder/model_builder.py b/src/sagemaker/serve/builder/model_builder.py index 63d5190839..a7a518105c 100644 --- a/src/sagemaker/serve/builder/model_builder.py +++ b/src/sagemaker/serve/builder/model_builder.py @@ -1440,7 +1440,8 @@ def _model_builder_optimize_wrapper( if is_gpu_instance and self.model and self.is_compiled: if is_llama_3_plus: raise ValueError( - "Compilation is not supported for models greater than Llama-3.0 with a GPU instance." + "Compilation is not supported for models greater " + "than Llama-3.0 with a GPU instance." ) if speculative_decoding_config: raise ValueError(