Skip to content

Commit 553838f

Browse files
committed
Add sanity check for instance usage classes
This commit adds a validator for usage classes ("spot" or "ondemand") assigned to cluster nodes through the `cluster_type` (in `cluster` section) or `compute_type` (in `queue` section) configuration parameters. Signed-off-by: ddeidda <[email protected]>
1 parent ceb3b39 commit 553838f

File tree

6 files changed

+153
-3
lines changed

6 files changed

+153
-3
lines changed

CHANGELOG.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ CHANGELOG
55
------
66
**ENHANCEMENTS**
77

8-
- Enable support for ARM instances in China and GovCloud regions when using Ubuntu 18.04 or Amazon Linux 2.
8+
- Enable support for ARM instances in China and GovCloud regions when using Ubuntu 18.04 or Amazon Linux 2.
9+
- Add validation for `cluster_type` configuration parameter in `cluster` section
10+
- Add validation for `compute_type` configuration parameter in `queue` section
11+
912

1013
**CHANGES**
1114

cli/src/pcluster/config/mappings.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
from pcluster.config.validators import (
5959
architecture_os_validator,
6060
base_os_validator,
61+
cluster_type_validator,
6162
cluster_validator,
6263
compute_instance_type_validator,
6364
compute_resource_validator,
@@ -99,6 +100,7 @@
99100
intel_hpc_os_validator,
100101
kms_key_validator,
101102
maintain_initial_size_validator,
103+
queue_compute_type_validator,
102104
queue_settings_validator,
103105
queue_validator,
104106
s3_bucket_uri_validator,
@@ -723,7 +725,7 @@
723725
"type": QueueJsonSection,
724726
"key": "queue",
725727
"default_label": "default",
726-
"validators": [queue_validator],
728+
"validators": [queue_validator, queue_compute_type_validator],
727729
"max_resources": 5,
728730
"params": OrderedDict([
729731
("compute_type", {
@@ -1103,6 +1105,7 @@
11031105
"default": "ondemand",
11041106
"allowed_values": ["ondemand", "spot"],
11051107
"cfn_param_mapping": "ClusterType",
1108+
"validators": [cluster_type_validator],
11061109
"update_policy": UpdatePolicy.COMPUTE_FLEET_STOP
11071110
}),
11081111
("spot_price", {

cli/src/pcluster/config/validators.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1185,7 +1185,7 @@ def check_unsupported_feature(compute_resource, feature_name, param_key):
11851185

11861186
instance_types = []
11871187
for compute_resource_label in compute_resource_labels:
1188-
compute_resource = pcluster_config.get_section("compute_resource", compute_resource_label)
1188+
compute_resource = pcluster_config.get_section("compute_resource", compute_resource_label.strip())
11891189
if compute_resource:
11901190
instance_type = compute_resource.get_param_value("instance_type")
11911191
if instance_type in instance_types:
@@ -1208,6 +1208,21 @@ def check_unsupported_feature(compute_resource, feature_name, param_key):
12081208
return errors, warnings
12091209

12101210

1211+
def queue_compute_type_validator(section_key, section_label, pcluster_config):
1212+
errors = []
1213+
warnings = []
1214+
queue_section = pcluster_config.get_section(section_key, section_label)
1215+
compute_resource_labels = str(queue_section.get_param_value("compute_resource_settings") or "").split(",")
1216+
1217+
for compute_resource_label in compute_resource_labels:
1218+
# Check that usage class set in queue section is supported by all compute resource instance types
1219+
compute_resource = pcluster_config.get_section("compute_resource", compute_resource_label.strip())
1220+
if compute_resource:
1221+
instance_type = compute_resource.get_param_value("instance_type")
1222+
check_usage_class(instance_type, queue_section.get_param_value("compute_type"), errors, warnings)
1223+
return errors, warnings
1224+
1225+
12111226
def settings_validator(param_key, param_value, pcluster_config):
12121227
errors = []
12131228
if param_value:
@@ -1601,3 +1616,26 @@ def ebs_volume_throughput_validator(section_key, section_label, pcluster_config)
16011616
)
16021617

16031618
return errors, warnings
1619+
1620+
1621+
def cluster_type_validator(param_key, param_value, pcluster_config):
1622+
errors = []
1623+
warnings = []
1624+
1625+
scheduler = pcluster_config.get_section("cluster").get_param_value("scheduler")
1626+
if scheduler != "awsbatch":
1627+
compute_instance_type = pcluster_config.get_section("cluster").get_param_value("compute_instance_type")
1628+
check_usage_class(compute_instance_type, param_value, errors, warnings)
1629+
1630+
return errors, warnings
1631+
1632+
1633+
def check_usage_class(instance_type, usage_class, errors, warnings):
1634+
supported_usage_classes = InstanceTypeInfo.init_from_instance_type(instance_type).supported_usage_classes()
1635+
1636+
if not supported_usage_classes:
1637+
warnings.append(
1638+
"Could not check support for usage class '{0}' with instance type '{1}'".format(usage_class, instance_type)
1639+
)
1640+
elif usage_class not in supported_usage_classes:
1641+
errors.append("Usage type '{0}' not supported with instance type '{1}'".format(usage_class, instance_type))

cli/src/pcluster/utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1350,3 +1350,12 @@ def supported_architecture(self):
13501350
def is_efa_supported(self):
13511351
"""Check whether EFA is supported."""
13521352
return self.instance_type_data.get("NetworkInfo").get("EfaSupported")
1353+
1354+
def supported_usage_classes(self):
1355+
"""Return the list supported usage classes."""
1356+
supported_classes = self.instance_type_data.get("SupportedUsageClasses", [])
1357+
if "on-demand" in supported_classes:
1358+
# Replace official AWS with internal naming convention
1359+
supported_classes.remove("on-demand")
1360+
supported_classes.append("ondemand")
1361+
return supported_classes

cli/tests/pcluster/config/test_validators.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,16 @@
2727
FSX_SUPPORTED_ARCHITECTURES_OSES,
2828
LOGFILE_LOGGER,
2929
architecture_os_validator,
30+
check_usage_class,
31+
cluster_type_validator,
3032
compute_resource_validator,
3133
disable_hyperthreading_architecture_validator,
3234
efa_gdr_validator,
3335
efa_os_arch_validator,
3436
fsx_ignored_parameters_validator,
3537
instances_architecture_compatibility_validator,
3638
intel_hpc_architecture_validator,
39+
queue_compute_type_validator,
3740
queue_validator,
3841
settings_validator,
3942
)
@@ -2808,3 +2811,96 @@ def test_efa_os_arch_validator(mocker, cluster_dict, architecture, expected_erro
28082811
def test_ebs_volume_throughput_validator(mocker, section_dict, expected_message):
28092812
config_parser_dict = {"cluster default": {"ebs_settings": "default"}, "ebs default": section_dict}
28102813
utils.assert_param_validator(mocker, config_parser_dict, expected_message)
2814+
2815+
2816+
@pytest.mark.parametrize(
2817+
"usage_class, supported_usage_classes, expected_error_message, expected_warning_message",
2818+
[
2819+
("ondemand", ["ondemand", "spot"], None, None),
2820+
("spot", ["ondemand", "spot"], None, None),
2821+
("ondemand", ["ondemand"], None, None),
2822+
("spot", ["spot"], None, None),
2823+
("spot", [], None, "Could not check support for usage class 'spot' with instance type 'instance-type'"),
2824+
("ondemand", [], None, "Could not check support for usage class 'ondemand' with instance type 'instance-type'"),
2825+
("spot", ["ondemand"], "Usage type 'spot' not supported with instance type 'instance-type'", None),
2826+
("ondemand", ["spot"], "Usage type 'ondemand' not supported with instance type 'instance-type'", None),
2827+
],
2828+
)
2829+
def test_check_usage_class(
2830+
mocker, usage_class, supported_usage_classes, expected_error_message, expected_warning_message
2831+
):
2832+
# This test checks the common logic triggered from cluster_type_validator and queue_compute_type_validator.
2833+
instance_type_info_mock = mocker.MagicMock()
2834+
mocker.patch(
2835+
"pcluster.config.cfn_param_types.InstanceTypeInfo.init_from_instance_type", return_value=instance_type_info_mock
2836+
)
2837+
instance_type_info_mock.supported_usage_classes.return_value = supported_usage_classes
2838+
2839+
errors = []
2840+
warnings = []
2841+
check_usage_class("instance-type", usage_class, errors, warnings)
2842+
2843+
if expected_error_message:
2844+
assert_that(errors).contains(expected_error_message)
2845+
else:
2846+
assert_that(errors).is_empty()
2847+
2848+
if expected_warning_message:
2849+
assert_that(warnings).contains(expected_warning_message)
2850+
else:
2851+
assert_that(warnings).is_empty()
2852+
2853+
2854+
@pytest.mark.parametrize(
2855+
"scheduler, expected_usage_class_check", [("sge", True), ("torque", True), ("slurm", True), ("awsbatch", False)]
2856+
)
2857+
def test_cluster_type_validator(mocker, scheduler, expected_usage_class_check):
2858+
# Usage class validation logic is tested in `test_check_usage_class`.
2859+
# This test only makes sure that the logic is triggered from validator.
2860+
mock = mocker.patch("pcluster.config.validators.check_usage_class", return_value=None)
2861+
cluster_dict = {"compute_instance_type": "t2.micro", "scheduler": scheduler}
2862+
config_parser_dict = {"cluster default": cluster_dict}
2863+
config_parser = configparser.ConfigParser()
2864+
config_parser.read_dict(config_parser_dict)
2865+
2866+
pcluster_config = utils.init_pcluster_config_from_configparser(config_parser, False, auto_refresh=False)
2867+
errors, warnings = cluster_type_validator("compute_type", "spot", pcluster_config)
2868+
if expected_usage_class_check:
2869+
mock.assert_called_with("t2.micro", "spot", [], [])
2870+
else:
2871+
mock.assert_not_called()
2872+
2873+
assert_that(errors).is_equal_to([])
2874+
assert_that(warnings).is_equal_to([])
2875+
2876+
2877+
@pytest.mark.parametrize("compute_type", [("ondemand"), ("spot")])
2878+
def test_queue_compute_type_validator(mocker, compute_type):
2879+
# Usage class validation logic is tested in `test_check_usage_class`.
2880+
# This test only makes sure that the logic is triggered from validator.
2881+
mock = mocker.patch("pcluster.config.validators.check_usage_class", return_value=None)
2882+
2883+
config_parser_dict = {
2884+
"cluster default": {
2885+
"queue_settings": "q1",
2886+
},
2887+
"queue q1": {"compute_resource_settings": "q1cr1, q1cr2", "compute_type": compute_type},
2888+
"compute_resource q1cr1": {"instance_type": "q1cr1_instance_type"},
2889+
"compute_resource q1cr2": {"instance_type": "q1cr2_instance_type"},
2890+
}
2891+
2892+
config_parser = configparser.ConfigParser()
2893+
config_parser.read_dict(config_parser_dict)
2894+
2895+
pcluster_config = utils.init_pcluster_config_from_configparser(config_parser, False, auto_refresh=False)
2896+
errors, warnings = queue_compute_type_validator("queue", "q1", pcluster_config)
2897+
mock.assert_has_calls(
2898+
[
2899+
mocker.call("q1cr1_instance_type", compute_type, [], []),
2900+
mocker.call("q1cr2_instance_type", compute_type, [], []),
2901+
],
2902+
any_order=True,
2903+
)
2904+
2905+
assert_that(errors).is_equal_to([])
2906+
assert_that(warnings).is_equal_to([])

cli/tests/pcluster/config/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,7 @@ def mock_instance_type_info(mocker, instance_type="t2.micro"):
140140
"InstanceType": instance_type,
141141
"VCpuInfo": {"DefaultVCpus": 4, "DefaultCores": 2},
142142
"NetworkInfo": {"EfaSupported": False},
143+
"SupportedUsageClasses": ["on-demand", "spot"],
143144
}
144145
),
145146
)

0 commit comments

Comments
 (0)