@@ -2683,7 +2683,7 @@ def test_optimize_exclusive_sharding(self, mock_get_serve_setting):
2683
2683
expected_error_message = """
2684
2684
Optimization cannot be performed for the following reasons:
2685
2685
- Optimizations that use Sharding are not supported for GPU instances.
2686
- - Optimizations that use Compilation and Quantization:awq are not supported for GPU instances.
2686
+ - Optimizations that use Compilation are not supported for GPU instances.
2687
2687
"""
2688
2688
2689
2689
self .assertRaisesRegex (
@@ -2866,10 +2866,28 @@ def test_corner_cases_throw_errors(self):
2866
2866
)
2867
2867
2868
2868
def test_trt_and_vllm_configurations_throw_errors_for_rule_set (self ):
2869
+ expected_compilation_quantization_error_message = """
2870
+ Optimization cannot be performed for the following reasons:
2871
+ - Optimizations that use Quantization must be provided with Compilation for GPU instances.
2872
+ - Optimizations that use Quantization:smoothquant are not supported for GPU instances.
2873
+ """
2874
+ self .assertRaisesRegex (
2875
+ ValueError ,
2876
+ textwrap .dedent (expected_compilation_quantization_error_message ),
2877
+ lambda : _validate_optimization_configuration (
2878
+ instance_type = "ml.g5.24xlarge" ,
2879
+ quantization_config = {
2880
+ "OverrideEnvironment" : {"OPTION_QUANTIZE" : "smoothquant" },
2881
+ },
2882
+ sharding_config = None ,
2883
+ speculative_decoding_config = None ,
2884
+ compilation_config = None ,
2885
+ ),
2886
+ )
2887
+
2869
2888
expected_quantization_error_message = """
2870
2889
Optimization cannot be performed for the following reasons:
2871
2890
- Optimizations that use Quantization:test are not supported for GPU instances.
2872
- - Optimizations that use Quantization:test are not supported for GPU instances.
2873
2891
"""
2874
2892
self .assertRaisesRegex (
2875
2893
ValueError ,
@@ -2910,43 +2928,6 @@ def test_neuron_configurations_throw_errors_for_rule_set(self):
2910
2928
),
2911
2929
)
2912
2930
2913
- def test_trt_configurations_throw_errors_for_rule_se (self ):
2914
- expected_compilation_quantization_error_message = """
2915
- Optimization cannot be performed for the following reasons:
2916
- - Optimizations that use Compilation must be provided with Quantization and vice-versa for GPU instances.
2917
- - Optimizations that use Quantization:awq are not supported for GPU instances.
2918
- """
2919
- self .assertRaisesRegex (
2920
- ValueError ,
2921
- textwrap .dedent (expected_compilation_quantization_error_message ),
2922
- lambda : _validate_optimization_configuration (
2923
- instance_type = "ml.g5.24xlarge" ,
2924
- quantization_config = {
2925
- "OverrideEnvironment" : {"OPTION_QUANTIZE" : "awq" },
2926
- },
2927
- sharding_config = None ,
2928
- speculative_decoding_config = None ,
2929
- compilation_config = None ,
2930
- ),
2931
- )
2932
-
2933
- expected_compilation_quantization_error_message = """
2934
- Optimization cannot be performed for the following reasons:
2935
- - Optimizations that use Compilation must be provided with Quantization and vice-versa for GPU instances.
2936
- - Optimizations that use Compilation are not supported for GPU instances.
2937
- """
2938
- self .assertRaisesRegex (
2939
- ValueError ,
2940
- textwrap .dedent (expected_compilation_quantization_error_message ),
2941
- lambda : _validate_optimization_configuration (
2942
- instance_type = "ml.g5.24xlarge" ,
2943
- quantization_config = None ,
2944
- sharding_config = None ,
2945
- speculative_decoding_config = None ,
2946
- compilation_config = {"key" : "value" },
2947
- ),
2948
- )
2949
-
2950
2931
def test_trt_configurations_rule_set (self ):
2951
2932
# Can be compiled with quantization
2952
2933
_validate_optimization_configuration (
@@ -2959,6 +2940,15 @@ def test_trt_configurations_rule_set(self):
2959
2940
compilation_config = {"key" : "value" },
2960
2941
),
2961
2942
2943
+ # Can be just compiled
2944
+ _validate_optimization_configuration (
2945
+ instance_type = "ml.g5.24xlarge" ,
2946
+ quantization_config = None ,
2947
+ sharding_config = None ,
2948
+ speculative_decoding_config = None ,
2949
+ compilation_config = {"key" : "value" },
2950
+ )
2951
+
2962
2952
def test_vllm_configurations_rule_set (self ):
2963
2953
# Can use speculative decoding
2964
2954
_validate_optimization_configuration (
@@ -2969,6 +2959,17 @@ def test_vllm_configurations_rule_set(self):
2969
2959
compilation_config = None ,
2970
2960
)
2971
2961
2962
+ # Can be quantized
2963
+ _validate_optimization_configuration (
2964
+ instance_type = "ml.g5.24xlarge" ,
2965
+ quantization_config = {
2966
+ "OverrideEnvironment" : {"OPTION_QUANTIZE" : "awq" },
2967
+ },
2968
+ sharding_config = None ,
2969
+ speculative_decoding_config = None ,
2970
+ compilation_config = None ,
2971
+ )
2972
+
2972
2973
# Can be sharded
2973
2974
_validate_optimization_configuration (
2974
2975
instance_type = "ml.g5.24xlarge" ,
0 commit comments