@@ -2386,11 +2386,11 @@ def test_optimize(
2386
2386
builder .pysdk_model = pysdk_model
2387
2387
2388
2388
job_name = "my-optimization-job"
2389
- instance_type = "ml.inf2.xlarge "
2389
+ instance_type = "ml.g5.24xlarge "
2390
2390
output_path = "s3://my-bucket/output"
2391
2391
quantization_config = {
2392
2392
"Image" : "quantization-image-uri" ,
2393
- "OverrideEnvironment" : {"ENV_VAR " : "value " },
2393
+ "OverrideEnvironment" : {"OPTION_QUANTIZE " : "awq " },
2394
2394
}
2395
2395
env_vars = {"Var1" : "value" , "Var2" : "value" }
2396
2396
kms_key = "arn:aws:kms:us-west-2:123456789012:key/my-key-id"
@@ -2428,15 +2428,15 @@ def test_optimize(
2428
2428
mock_send_telemetry .assert_called_once ()
2429
2429
mock_sagemaker_session .sagemaker_client .create_optimization_job .assert_called_once_with (
2430
2430
OptimizationJobName = "my-optimization-job" ,
2431
- DeploymentInstanceType = "ml.inf2.xlarge " ,
2431
+ DeploymentInstanceType = "ml.g5.24xlarge " ,
2432
2432
RoleArn = "arn:aws:iam::123456789012:role/SageMakerRole" ,
2433
2433
OptimizationEnvironment = {"Var1" : "value" , "Var2" : "value" },
2434
2434
ModelSource = {"S3" : {"S3Uri" : "s3://uri" }},
2435
2435
OptimizationConfigs = [
2436
2436
{
2437
2437
"ModelQuantizationConfig" : {
2438
2438
"Image" : "quantization-image-uri" ,
2439
- "OverrideEnvironment" : {"ENV_VAR " : "value " },
2439
+ "OverrideEnvironment" : {"OPTION_QUANTIZE " : "awq " },
2440
2440
}
2441
2441
}
2442
2442
],
@@ -2650,7 +2650,7 @@ def test_optimize_local_mode(self, mock_get_serve_setting):
2650
2650
"Model optimization is only supported in Sagemaker Endpoint Mode." ,
2651
2651
lambda : model_builder .optimize (
2652
2652
instance_type = "ml.g5.24xlarge" ,
2653
- quantization_config = {"OverrideEnvironment" : {"OPTION_QUANTIZE" : "awq" }}
2653
+ quantization_config = {"OverrideEnvironment" : {"OPTION_QUANTIZE" : "awq" }},
2654
2654
),
2655
2655
)
2656
2656
@@ -2842,16 +2842,22 @@ def test_corner_cases_throw_errors(self):
2842
2842
ValueError ,
2843
2843
"Optimizations that uses None instance type are not currently supported" ,
2844
2844
lambda : _validate_optimization_configuration (
2845
- sharding_config = {"OverrideEnvironment " : { "OPTION_QUANTIZE" : "awq" } },
2845
+ sharding_config = {"key " : "value" },
2846
2846
instance_type = None ,
2847
2847
quantization_config = None ,
2848
2848
speculative_decoding_config = None ,
2849
2849
compilation_config = None ,
2850
2850
),
2851
2851
)
2852
+
2853
+ expected_missing_optimization_configs_error_message = """
2854
+ Optimization cannot be performed for the following reasons:
2855
+ - Optimizations for TRT that use no optimization configurations are not currently supported on GPU instances
2856
+ - Optimizations for vLLM that use no optimization configurations are not currently supported on GPU instances
2857
+ """
2852
2858
self .assertRaisesRegex (
2853
2859
ValueError ,
2854
- "Optimizations are not currently supported without optimization configurations." ,
2860
+ textwrap . dedent ( expected_missing_optimization_configs_error_message ) ,
2855
2861
lambda : _validate_optimization_configuration (
2856
2862
instance_type = "ml.g5.24xlarge" ,
2857
2863
quantization_config = None ,
@@ -2881,11 +2887,39 @@ def test_trt_and_vllm_configurations_throw_errors_for_rule_set(self):
2881
2887
),
2882
2888
)
2883
2889
2884
- @patch .object (ModelBuilder , "_get_serve_setting" , autospec = True )
2885
- def test_neuron_configurations_throw_errors_for_rule_set (self , mock_get_serve_setting ):
2886
- pass
2890
def test_neuron_configurations_throw_errors_for_rule_set(self):
    """Check the validator rejects unsupported configs on ml.inf2.xlarge.

    Two cases are exercised, each with exactly one optimization config
    supplied: speculative decoding, then sharding. Both must raise a
    ValueError whose message matches the expected text.
    """
    neuron_instance_type = "ml.inf2.xlarge"

    # Case 1: only a speculative decoding config is provided.
    speculative_decoding_error = (
        "Optimizations for Neuron that use Speculative Decoding "
        "are not currently supported on Neuron instances"
    )
    with self.assertRaisesRegex(ValueError, speculative_decoding_error):
        _validate_optimization_configuration(
            instance_type=neuron_instance_type,
            quantization_config=None,
            speculative_decoding_config={"key": "value"},
            compilation_config=None,
            sharding_config=None,
        )

    # Case 2: only a sharding config is provided.
    sharding_error = (
        "Optimizations for Neuron that use Sharding "
        "are not currently supported on Neuron instances"
    )
    with self.assertRaisesRegex(ValueError, sharding_error):
        _validate_optimization_configuration(
            instance_type=neuron_instance_type,
            quantization_config=None,
            speculative_decoding_config=None,
            compilation_config=None,
            sharding_config={"key": "value"},
        )
2887
2920
2888
2921
def test_trt_configurations_rule_set (self ):
2922
+ # Can be quantized
2889
2923
_validate_optimization_configuration (
2890
2924
instance_type = "ml.g5.24xlarge" ,
2891
2925
quantization_config = {
@@ -2896,6 +2930,51 @@ def test_trt_configurations_rule_set(self):
2896
2930
compilation_config = None ,
2897
2931
)
2898
2932
2899
- @patch .object (ModelBuilder , "_get_serve_setting" , autospec = True )
2900
- def test_vllm_configurations_rule_set (self , mock_get_serve_setting ):
2901
- pass
2933
+ # Can be compiled
2934
+ _validate_optimization_configuration (
2935
+ instance_type = "ml.g5.24xlarge" ,
2936
+ quantization_config = None ,
2937
+ sharding_config = None ,
2938
+ speculative_decoding_config = None ,
2939
+ compilation_config = {"key" : "value" },
2940
+ )
2941
+
2942
def test_vllm_configurations_rule_set(self):
    """Check the validator accepts each single-config setup on ml.g5.24xlarge.

    Every case supplies exactly one optimization config and leaves the
    others as None; the test passes when no call raises.
    """
    all_configs_unset = {
        "quantization_config": None,
        "sharding_config": None,
        "speculative_decoding_config": None,
        "compilation_config": None,
    }

    accepted_cases = (
        # Can be quantized
        {
            "quantization_config": {
                "OverrideEnvironment": {"OPTION_QUANTIZE": "awq"},
            },
        },
        # Can use speculative decoding
        {"speculative_decoding_config": {"key": "value"}},
        # Can be sharded
        {"sharding_config": {"key": "value"}},
    )

    for single_config in accepted_cases:
        # Overlay the one config under test on top of the all-None baseline.
        call_kwargs = {**all_configs_unset, **single_config}
        _validate_optimization_configuration(
            instance_type="ml.g5.24xlarge",
            **call_kwargs,
        )
2971
+
2972
def test_neuron_configurations_rule_set(self):
    """Check the validator accepts a compilation-only setup on ml.inf2.xlarge.

    The test passes when the call completes without raising.
    """
    # Can be compiled
    compilation_only_kwargs = {
        "instance_type": "ml.inf2.xlarge",
        "quantization_config": None,
        "sharding_config": None,
        "speculative_decoding_config": None,
        "compilation_config": {"key": "value"},
    }
    _validate_optimization_configuration(**compilation_only_kwargs)
0 commit comments