Update TRT optimization ruleset: allow compilation without quantization

gwang111 · gwang111 · commit 52482c9fd077 · 2024-11-18T21:20:40.000Z
diff --git a/src/sagemaker/serve/validations/optimization.py b/src/sagemaker/serve/validations/optimization.py
@@ -65,8 +65,8 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont
         if rule_set == _OptimizationContainer.TRT:
             is_compiled = optimization_combination.compilation.copy().pop()
             is_quantized = optimization_combination.quantization_technique.copy().pop()
-            if is_compiled and not is_quantized or is_quantized and not is_compiled:
-                raise ValueError(f"Compilation must be provided with Quantization")
+            if is_quantized and not is_compiled:
+                raise ValueError(f"Quantization must be provided with Compilation")
 
 
 TRUTHY_SET = {None, True}
@@ -76,7 +76,7 @@ def validate_against(self, optimization_combination, rule_set: _OptimizationCont
     "optimization_combination": _OptimizationCombination(
         optimization_container=_OptimizationContainer.TRT,
         compilation=TRUTHY_SET,
-        quantization_technique={None, "awq", "fp8", "smooth_quant"},
+        quantization_technique={None, "awq", "fp8", "smoothquant"},
         speculative_decoding=FALSY_SET,
         sharding=FALSY_SET,
     ),
@@ -189,18 +189,23 @@ def _validate_optimization_configuration(
                         optimization_combination, rule_set=_OptimizationContainer.VLLM
                     )
                 )
-                print("fsdafas")
             except ValueError as vllm_compare_error:
-                if "Compilation must be provided with Quantization" in str(trt_compare_error):
+                if "Quantization must be provided with Compilation" in str(trt_compare_error):
                     joint_error_msg = f"""
                     Optimization cannot be performed for the following reasons:
-                    - Optimizations that use {trt_compare_error} and vice-versa for GPU instances.
+                    - Optimizations that use {trt_compare_error} for GPU instances.
                     - Optimizations that use {vllm_compare_error} are not supported for GPU instances.
                     """
                 else:
-                    joint_error_msg = f"""
-                    Optimization cannot be performed for the following reasons:
-                    - Optimizations that use {trt_compare_error} are not supported for GPU instances.
-                    - Optimizations that use {vllm_compare_error} are not supported for GPU instances.
-                    """
+                    if str(trt_compare_error) == str(vllm_compare_error):
+                        joint_error_msg = f"""
+                        Optimization cannot be performed for the following reasons:
+                        - Optimizations that use {trt_compare_error} are not supported for GPU instances.
+                        """
+                    else:
+                        joint_error_msg = f"""
+                        Optimization cannot be performed for the following reasons:
+                        - Optimizations that use {trt_compare_error} are not supported for GPU instances.
+                        - Optimizations that use {vllm_compare_error} are not supported for GPU instances.
+                        """
                 raise ValueError(textwrap.dedent(joint_error_msg))
diff --git a/tests/unit/sagemaker/serve/builder/test_model_builder.py b/tests/unit/sagemaker/serve/builder/test_model_builder.py
@@ -2683,7 +2683,7 @@ def test_optimize_exclusive_sharding(self, mock_get_serve_setting):
         expected_error_message = """
         Optimization cannot be performed for the following reasons:
         - Optimizations that use Sharding are not supported for GPU instances.
-        - Optimizations that use Compilation and Quantization:awq are not supported for GPU instances.
+        - Optimizations that use Compilation are not supported for GPU instances.
         """
 
         self.assertRaisesRegex(
@@ -2866,10 +2866,28 @@ def test_corner_cases_throw_errors(self):
         )
 
     def test_trt_and_vllm_configurations_throw_errors_for_rule_set(self):
+        expected_compilation_quantization_error_message = """
+        Optimization cannot be performed for the following reasons:
+        - Optimizations that use Quantization must be provided with Compilation for GPU instances.
+        - Optimizations that use Quantization:smoothquant are not supported for GPU instances.
+        """
+        self.assertRaisesRegex(
+            ValueError,
+            textwrap.dedent(expected_compilation_quantization_error_message),
+            lambda: _validate_optimization_configuration(
+                instance_type="ml.g5.24xlarge",
+                quantization_config={
+                    "OverrideEnvironment": {"OPTION_QUANTIZE": "smoothquant"},
+                },
+                sharding_config=None,
+                speculative_decoding_config=None,
+                compilation_config=None,
+            ),
+        )
+
         expected_quantization_error_message = """
         Optimization cannot be performed for the following reasons:
         - Optimizations that use Quantization:test are not supported for GPU instances.
-        - Optimizations that use Quantization:test are not supported for GPU instances.
         """
         self.assertRaisesRegex(
             ValueError,
@@ -2910,43 +2928,6 @@ def test_neuron_configurations_throw_errors_for_rule_set(self):
             ),
         )
 
-    def test_trt_configurations_throw_errors_for_rule_se(self):
-        expected_compilation_quantization_error_message = """
-        Optimization cannot be performed for the following reasons:
-        - Optimizations that use Compilation must be provided with Quantization and vice-versa for GPU instances.
-        - Optimizations that use Quantization:awq are not supported for GPU instances.
-        """
-        self.assertRaisesRegex(
-            ValueError,
-            textwrap.dedent(expected_compilation_quantization_error_message),
-            lambda: _validate_optimization_configuration(
-                instance_type="ml.g5.24xlarge",
-                quantization_config={
-                    "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"},
-                },
-                sharding_config=None,
-                speculative_decoding_config=None,
-                compilation_config=None,
-            ),
-        )
-
-        expected_compilation_quantization_error_message = """
-        Optimization cannot be performed for the following reasons:
-        - Optimizations that use Compilation must be provided with Quantization and vice-versa for GPU instances.
-        - Optimizations that use Compilation are not supported for GPU instances.
-        """
-        self.assertRaisesRegex(
-            ValueError,
-            textwrap.dedent(expected_compilation_quantization_error_message),
-            lambda: _validate_optimization_configuration(
-                instance_type="ml.g5.24xlarge",
-                quantization_config=None,
-                sharding_config=None,
-                speculative_decoding_config=None,
-                compilation_config={"key": "value"},
-            ),
-        )
-
     def test_trt_configurations_rule_set(self):
         # Can be compiled with quantization
         _validate_optimization_configuration(
@@ -2959,6 +2940,15 @@ def test_trt_configurations_rule_set(self):
             compilation_config={"key": "value"},
         ),
 
+        # Can be just compiled
+        _validate_optimization_configuration(
+            instance_type="ml.g5.24xlarge",
+            quantization_config=None,
+            sharding_config=None,
+            speculative_decoding_config=None,
+            compilation_config={"key": "value"},
+        )
+
     def test_vllm_configurations_rule_set(self):
         # Can use speculative decoding
         _validate_optimization_configuration(
@@ -2969,6 +2959,17 @@ def test_vllm_configurations_rule_set(self):
             compilation_config=None,
         )
 
+        # Can be quantized
+        _validate_optimization_configuration(
+            instance_type="ml.g5.24xlarge",
+            quantization_config={
+                "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"},
+            },
+            sharding_config=None,
+            speculative_decoding_config=None,
+            compilation_config=None,
+        )
+
         # Can be sharded
         _validate_optimization_configuration(
             instance_type="ml.g5.24xlarge",