Add sanity check for instance usage classes

ddeidda · ddeidda · commit 553838f547ac · 2021-03-17T16:52:29.000+01:00
This commit adds a validator for usage classes ("spot" or "ondemand") assigned to cluster nodes through the `cluster_type` (in `cluster` section) or `compute_type` (in `queue` section) configuration parameters.

Signed-off-by: ddeidda &lt;ddeidda@amazon.com&gt;
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,10 @@ CHANGELOG
 ------
 **ENHANCEMENTS**
 
-- Enable support for ARM instances in China and GovCloud regions when using Ubuntu 18.04 or Amazon Linux 2. 
+- Enable support for ARM instances in China and GovCloud regions when using Ubuntu 18.04 or Amazon Linux 2.
+- Add validation for `cluster_type` configuration parameter in `cluster` section.
+- Add validation for `compute_type` configuration parameter in `queue` section.
+ 
 
 **CHANGES**
 
diff --git a/cli/src/pcluster/config/mappings.py b/cli/src/pcluster/config/mappings.py
@@ -58,6 +58,7 @@
 from pcluster.config.validators import (
     architecture_os_validator,
     base_os_validator,
+    cluster_type_validator,
     cluster_validator,
     compute_instance_type_validator,
     compute_resource_validator,
@@ -99,6 +100,7 @@
     intel_hpc_os_validator,
     kms_key_validator,
     maintain_initial_size_validator,
+    queue_compute_type_validator,
     queue_settings_validator,
     queue_validator,
     s3_bucket_uri_validator,
@@ -723,7 +725,7 @@
     "type": QueueJsonSection,
     "key": "queue",
     "default_label": "default",
-    "validators": [queue_validator],
+    "validators": [queue_validator, queue_compute_type_validator],
     "max_resources": 5,
     "params": OrderedDict([
         ("compute_type", {
@@ -1103,6 +1105,7 @@
                 "default": "ondemand",
                 "allowed_values": ["ondemand", "spot"],
                 "cfn_param_mapping": "ClusterType",
+                "validators": [cluster_type_validator],
                 "update_policy": UpdatePolicy.COMPUTE_FLEET_STOP
             }),
             ("spot_price", {
diff --git a/cli/src/pcluster/config/validators.py b/cli/src/pcluster/config/validators.py
@@ -1185,7 +1185,7 @@ def check_unsupported_feature(compute_resource, feature_name, param_key):
 
     instance_types = []
     for compute_resource_label in compute_resource_labels:
-        compute_resource = pcluster_config.get_section("compute_resource", compute_resource_label)
+        compute_resource = pcluster_config.get_section("compute_resource", compute_resource_label.strip())
         if compute_resource:
             instance_type = compute_resource.get_param_value("instance_type")
             if instance_type in instance_types:
@@ -1208,6 +1208,21 @@ def check_unsupported_feature(compute_resource, feature_name, param_key):
     return errors, warnings
 
 
+def queue_compute_type_validator(section_key, section_label, pcluster_config):
+    """
+    Validate that the queue usage class is supported by the instance type of each of its compute resources.
+
+    :param section_key: key of the section being validated
+    :param section_label: label of the section being validated
+    :param pcluster_config: configuration object from which sections are retrieved
+    :return: a tuple (errors, warnings) of message lists
+    """
+    errors = []
+    warnings = []
+    queue_section = pcluster_config.get_section(section_key, section_label)
+    # The usage class is defined once per queue: read it a single time instead of once per compute resource
+    compute_type = queue_section.get_param_value("compute_type")
+    compute_resource_labels = str(queue_section.get_param_value("compute_resource_settings") or "").split(",")
+
+    for compute_resource_label in compute_resource_labels:
+        # Labels are comma separated and may carry surrounding spaces: strip them before the lookup
+        compute_resource = pcluster_config.get_section("compute_resource", compute_resource_label.strip())
+        if compute_resource:
+            # Check that usage class set in queue section is supported by the compute resource instance type
+            instance_type = compute_resource.get_param_value("instance_type")
+            check_usage_class(instance_type, compute_type, errors, warnings)
+    return errors, warnings
+
+
 def settings_validator(param_key, param_value, pcluster_config):
     errors = []
     if param_value:
@@ -1601,3 +1616,26 @@ def ebs_volume_throughput_validator(section_key, section_label, pcluster_config)
             )
 
     return errors, warnings
+
+
+def cluster_type_validator(param_key, param_value, pcluster_config):
+    """
+    Check the cluster usage class against the compute instance type, unless the scheduler is awsbatch.
+
+    :param param_key: key of the parameter being validated (not used by the check)
+    :param param_value: usage class to validate
+    :param pcluster_config: configuration object from which the cluster section is retrieved
+    :return: a tuple (errors, warnings) of message lists
+    """
+    errors, warnings = [], []
+    cluster_section = pcluster_config.get_section("cluster")
+
+    if cluster_section.get_param_value("scheduler") != "awsbatch":
+        check_usage_class(cluster_section.get_param_value("compute_instance_type"), param_value, errors, warnings)
+
+    return errors, warnings
+
+
+def check_usage_class(instance_type, usage_class, errors, warnings):
+    """
+    Record in errors/warnings the outcome of checking usage_class against the given instance type.
+
+    A warning is appended when no supported usage classes are reported for the instance type;
+    an error is appended when usage_class is not among the reported ones.
+    Both lists are modified in place and nothing is returned.
+    """
+    instance_type_info = InstanceTypeInfo.init_from_instance_type(instance_type)
+    supported = instance_type_info.supported_usage_classes()
+
+    if supported and usage_class not in supported:
+        errors.append("Usage type '{0}' not supported with instance type '{1}'".format(usage_class, instance_type))
+    elif not supported:
+        warnings.append(
+            "Could not check support for usage class '{0}' with instance type '{1}'".format(usage_class, instance_type)
+        )
diff --git a/cli/src/pcluster/utils.py b/cli/src/pcluster/utils.py
@@ -1350,3 +1350,12 @@ def supported_architecture(self):
     def is_efa_supported(self):
         """Check whether EFA is supported."""
         # Reads the "EfaSupported" flag nested under the "NetworkInfo" entry of the instance type data
         return self.instance_type_data.get("NetworkInfo").get("EfaSupported")
+
+    def supported_usage_classes(self):
+        """
+        Return the list of supported usage classes, using the internal naming convention.
+
+        The official AWS name "on-demand" is returned as "ondemand"; any other value is returned unchanged.
+        A new list is built on every call, so the list stored in the instance type data is never modified.
+        An empty list is returned when the instance type data reports no usage classes.
+        """
+        supported_classes = self.instance_type_data.get("SupportedUsageClasses") or []
+        # Replace official AWS with internal naming convention
+        return ["ondemand" if usage_class == "on-demand" else usage_class for usage_class in supported_classes]
diff --git a/cli/tests/pcluster/config/test_validators.py b/cli/tests/pcluster/config/test_validators.py
@@ -27,13 +27,16 @@
     FSX_SUPPORTED_ARCHITECTURES_OSES,
     LOGFILE_LOGGER,
     architecture_os_validator,
+    check_usage_class,
+    cluster_type_validator,
     compute_resource_validator,
     disable_hyperthreading_architecture_validator,
     efa_gdr_validator,
     efa_os_arch_validator,
     fsx_ignored_parameters_validator,
     instances_architecture_compatibility_validator,
     intel_hpc_architecture_validator,
+    queue_compute_type_validator,
     queue_validator,
     settings_validator,
 )
@@ -2808,3 +2811,96 @@ def test_efa_os_arch_validator(mocker, cluster_dict, architecture, expected_erro
 def test_ebs_volume_throughput_validator(mocker, section_dict, expected_message):
     config_parser_dict = {"cluster default": {"ebs_settings": "default"}, "ebs default": section_dict}
     utils.assert_param_validator(mocker, config_parser_dict, expected_message)
+
+
+@pytest.mark.parametrize(
+    "usage_class, supported_usage_classes, expected_error_message, expected_warning_message",
+    [
+        ("ondemand", ["ondemand", "spot"], None, None),
+        ("spot", ["ondemand", "spot"], None, None),
+        ("ondemand", ["ondemand"], None, None),
+        ("spot", ["spot"], None, None),
+        ("spot", [], None, "Could not check support for usage class 'spot' with instance type 'instance-type'"),
+        ("ondemand", [], None, "Could not check support for usage class 'ondemand' with instance type 'instance-type'"),
+        ("spot", ["ondemand"], "Usage type 'spot' not supported with instance type 'instance-type'", None),
+        ("ondemand", ["spot"], "Usage type 'ondemand' not supported with instance type 'instance-type'", None),
+    ],
+)
+def test_check_usage_class(
+    mocker, usage_class, supported_usage_classes, expected_error_message, expected_warning_message
+):
+    """Verify the shared usage class check used by cluster_type_validator and queue_compute_type_validator."""
+    # Make the instance type lookup report the parametrized list of supported usage classes
+    instance_type_info_mock = mocker.MagicMock()
+    instance_type_info_mock.supported_usage_classes.return_value = supported_usage_classes
+    mocker.patch(
+        "pcluster.config.cfn_param_types.InstanceTypeInfo.init_from_instance_type", return_value=instance_type_info_mock
+    )
+
+    errors, warnings = [], []
+    check_usage_class("instance-type", usage_class, errors, warnings)
+
+    # Each list must either contain its expected message or be empty when no message is expected
+    for messages, expected_message in ((errors, expected_error_message), (warnings, expected_warning_message)):
+        if expected_message:
+            assert_that(messages).contains(expected_message)
+        else:
+            assert_that(messages).is_empty()
+
+
+@pytest.mark.parametrize(
+    "scheduler, expected_usage_class_check", [("sge", True), ("torque", True), ("slurm", True), ("awsbatch", False)]
+)
+def test_cluster_type_validator(mocker, scheduler, expected_usage_class_check):
+    """Verify that cluster_type_validator triggers the usage class check for every scheduler except awsbatch."""
+    # Usage class validation logic is tested in `test_check_usage_class`.
+    # This test only makes sure that the logic is triggered from validator.
+    mock = mocker.patch("pcluster.config.validators.check_usage_class", return_value=None)
+    cluster_dict = {"compute_instance_type": "t2.micro", "scheduler": scheduler}
+    config_parser_dict = {"cluster default": cluster_dict}
+    config_parser = configparser.ConfigParser()
+    config_parser.read_dict(config_parser_dict)
+
+    pcluster_config = utils.init_pcluster_config_from_configparser(config_parser, False, auto_refresh=False)
+    # The validator is registered on the `cluster_type` parameter, so invoke it with that key
+    errors, warnings = cluster_type_validator("cluster_type", "spot", pcluster_config)
+    if expected_usage_class_check:
+        mock.assert_called_with("t2.micro", "spot", [], [])
+    else:
+        mock.assert_not_called()
+
+    assert_that(errors).is_equal_to([])
+    assert_that(warnings).is_equal_to([])
+
+
+@pytest.mark.parametrize("compute_type", ["ondemand", "spot"])
+def test_queue_compute_type_validator(mocker, compute_type):
+    """Verify that queue_compute_type_validator triggers the usage class check for each compute resource."""
+    # The check itself is covered by `test_check_usage_class`; here it is replaced by a mock
+    # so that only the calls issued by the validator are observed.
+    usage_class_check_mock = mocker.patch("pcluster.config.validators.check_usage_class", return_value=None)
+
+    config_parser = configparser.ConfigParser()
+    config_parser.read_dict(
+        {
+            "cluster default": {
+                "queue_settings": "q1",
+            },
+            "queue q1": {"compute_resource_settings": "q1cr1, q1cr2", "compute_type": compute_type},
+            "compute_resource q1cr1": {"instance_type": "q1cr1_instance_type"},
+            "compute_resource q1cr2": {"instance_type": "q1cr2_instance_type"},
+        }
+    )
+
+    pcluster_config = utils.init_pcluster_config_from_configparser(config_parser, False, auto_refresh=False)
+    errors, warnings = queue_compute_type_validator("queue", "q1", pcluster_config)
+
+    # One check is expected per compute resource of the queue, in any order
+    expected_calls = [
+        mocker.call(instance_type, compute_type, [], [])
+        for instance_type in ("q1cr1_instance_type", "q1cr2_instance_type")
+    ]
+    usage_class_check_mock.assert_has_calls(expected_calls, any_order=True)
+
+    assert_that(errors).is_equal_to([])
+    assert_that(warnings).is_equal_to([])
diff --git a/cli/tests/pcluster/config/utils.py b/cli/tests/pcluster/config/utils.py
@@ -140,6 +140,7 @@ def mock_instance_type_info(mocker, instance_type="t2.micro"):
                 "InstanceType": instance_type,
                 "VCpuInfo": {"DefaultVCpus": 4, "DefaultCores": 2},
                 "NetworkInfo": {"EfaSupported": False},
+                "SupportedUsageClasses": ["on-demand", "spot"],
             }
         ),
     )

Original file line number	Diff line number	Diff line change
`@@ -140,6 +140,7 @@ def mock_instance_type_info(mocker, instance_type="t2.micro"):`
`140`	`140`	`"InstanceType": instance_type,`
`141`	`141`	`"VCpuInfo": {"DefaultVCpus": 4, "DefaultCores": 2},`
`142`	`142`	`"NetworkInfo": {"EfaSupported": False},`
	`143`	`+ "SupportedUsageClasses": ["on-demand", "spot"],`
`143`	`144`	`}`
`144`	`145`	`),`
`145`	`146`	`)`