@@ -2910,8 +2910,7 @@ def test_neuron_configurations_throw_errors_for_rule_set(self):
2910
2910
),
2911
2911
)
2912
2912
2913
- def test_trt_configurations_rule_set (self ):
2914
- # Can be quantized
2913
+ def test_trt_configurations_throw_errors_for_rule_set (self ):
2915
2914
expected_compilation_quantization_error_message = """
2916
2915
Optimization cannot be performed for the following reasons:
2917
2916
- Optimizations that use Compilation must be provided with Quantization and vice-versa for GPU instances.
@@ -2931,7 +2930,6 @@ def test_trt_configurations_rule_set(self):
2931
2930
),
2932
2931
)
2933
2932
2934
- # Can be compiled
2935
2933
expected_compilation_quantization_error_message = """
2936
2934
Optimization cannot be performed for the following reasons:
2937
2935
- Optimizations that use Compilation must be provided with Quantization and vice-versa for GPU instances.
@@ -2949,6 +2947,18 @@ def test_trt_configurations_rule_set(self):
2949
2947
),
2950
2948
)
2951
2949
2950
+ def test_trt_configurations_rule_set (self ):
2951
+ # Can be compiled with quantization
2952
+ _validate_optimization_configuration (
2953
+ instance_type = "ml.g5.24xlarge" ,
2954
+ quantization_config = {
2955
+ "OverrideEnvironment" : {"OPTION_QUANTIZE" : "awq" },
2956
+ },
2957
+ sharding_config = None ,
2958
+ speculative_decoding_config = None ,
2959
+ compilation_config = {"key" : "value" },
2960
+ ),
2961
+
2952
2962
def test_vllm_configurations_rule_set (self ):
2953
2963
# Can use speculative decoding
2954
2964
_validate_optimization_configuration (
0 commit comments