Skip to content

Commit c9d859c

Browse files
hanwen-cluster authored and hgreebe committed
[integ-test] rotate instance type for some integration tests
1. Unlike operating systems, instance types are region dependent. Therefore, we cannot rotate them with general Jinja variables like `{{ OS_X86_1 }}`; we need region-specific Jinja variables like `{{ US_EAST_1_INSTANCE_TYPE_0 }}`.
2. For code efficiency, this commit only populates instance types for three large AWS regions. The code can be extended if more regions need to be added.
3. This commit rotates instance types only in `test_essential_features` and `test_cluster_with_gpu_health_checks`. The code can be extended if more tests need to be added.
4. Improve `test_cluster_with_gpu_health_checks` so that it can run on both x86 and ARM.
Signed-off-by: Hanwen <[email protected]>
1 parent c114c0f commit c9d859c

File tree

4 files changed

+110
-15
lines changed

4 files changed

+110
-15
lines changed

tests/integration-tests/configs/develop.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ test-suites:
1717
basic:
1818
test_essential_features.py::test_essential_features:
1919
dimensions:
20-
- regions: ["af-south-1"]
21-
instances: {{ common.INSTANCES_DEFAULT_X86 }}
20+
- regions: [{{ US_EAST_1_INSTANCE_TYPE_0_AZ }}]
21+
instances: [{{ US_EAST_1_INSTANCE_TYPE_0 }}.xlarge]
2222
oss: [{{ OS_X86_1 }}]
2323
schedulers: ["slurm"]
2424
capacity_reservations:
@@ -288,8 +288,8 @@ test-suites:
288288
health_checks:
289289
test_gpu_health_checks.py::test_cluster_with_gpu_health_checks:
290290
dimensions:
291-
- regions: ["eu-west-1"]
292-
instances: {{ common.INSTANCES_DEFAULT_X86 }}
291+
- regions: [{{ EU_WEST_1_GPU_INSTANCE_TYPE_0_AZ }}]
292+
instances: [{{ EU_WEST_1_GPU_INSTANCE_TYPE_0 }}.xlarge]
293293
oss: [{{ OS_X86_5 }}]
294294
schedulers: ["slurm"]
295295
iam:

tests/integration-tests/framework/tests_configuration/config_renderer.py

Lines changed: 92 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import os
1414
from datetime import date
1515

16+
import boto3
1617
import yaml
1718
from jinja2 import FileSystemLoader
1819
from jinja2.sandbox import SandboxedEnvironment
@@ -60,6 +61,94 @@ def _get_os_parameters(config=None, args=None):
6061
return result
6162

6263

64+
def _collect_xlarge_offerings(ec2_client, excluded_prefixes):
    """Map every non-excluded `.xlarge` instance type offered in the client's region to its availability zones."""
    zones_by_type = {}
    paginator = ec2_client.get_paginator("describe_instance_type_offerings")
    for page in paginator.paginate(LocationType="availability-zone"):
        for offering in page["InstanceTypeOfferings"]:
            name = offering["InstanceType"]
            # ".xlarge" (with the dot) matches only the plain xlarge size, not "2xlarge", "4xlarge", ...
            if name.endswith(".xlarge") and not name.startswith(excluded_prefixes):
                zones_by_type.setdefault(name, []).append(offering["Location"])
    return zones_by_type


def _filter_gpu_instance_types(ec2_client, instance_types, batch_size=100):
    """Return the subset of `instance_types` for which EC2 reports a `GpuInfo` section."""
    gpu_instance_types = []
    paginator = ec2_client.get_paginator("describe_instance_types")
    # The EC2 API reference documents a maximum of 100 entries for the InstanceTypes filter, so the
    # list is queried in batches. Iterating over batches also means that an empty input performs no
    # call at all (an empty filter would otherwise describe *every* instance type).
    for start in range(0, len(instance_types), batch_size):
        batch = instance_types[start : start + batch_size]
        for page in paginator.paginate(InstanceTypes=batch):
            # NOTE(review): GpuInfo appears to be reported regardless of GPU manufacturer; confirm
            # whether non-NVIDIA types should be filtered out for the GPU health-check test.
            gpu_instance_types.extend(
                described["InstanceType"] for described in page["InstanceTypes"] if described.get("GpuInfo")
            )
    return gpu_instance_types


def _rotated_instance_type_parameters(key_prefix, instance_types, zones_by_type, region, offset):
    """Build the `<key_prefix>_<index>` and `<key_prefix>_<index>_AZ` entries, rotating the list by `offset`."""
    parameters = {}
    count = len(instance_types)
    for index in range(count):
        instance_type = instance_types[(offset + index) % count]
        # Only the family is exposed (e.g. "c5"); the config templates append ".xlarge" themselves.
        parameters[f"{key_prefix}_{index}"] = instance_type[: -len(".xlarge")]
        # Sorted so the selected zone does not depend on the order of the API response.
        zones = sorted(zones_by_type[instance_type])
        # Types offered in at most two zones are pinned to one of those zones; otherwise the whole region is used.
        parameters[f"{key_prefix}_{index}_AZ"] = zones[0] if len(zones) <= 2 else region
    return parameters


def _default_instance_type_parameters(region_jinja, region):
    """Build the fallback parameters (c5 / g4dn in the whole region) used when EC2 cannot be queried."""
    parameters = {}
    for index in range(100):
        parameters[f"{region_jinja}_INSTANCE_TYPE_{index}"] = "c5"
        parameters[f"{region_jinja}_INSTANCE_TYPE_{index}_AZ"] = region
    for index in range(10):
        parameters[f"{region_jinja}_GPU_INSTANCE_TYPE_{index}"] = "g4dn"
        parameters[f"{region_jinja}_GPU_INSTANCE_TYPE_{index}_AZ"] = region
    return parameters


def _get_instance_type_parameters(regions=("us-east-1", "us-west-2", "eu-west-1"), today=None):
    """
    Gets Instance jinja parameters.

    For every region, the `.xlarge` instance types offered there are listed, sorted and rotated by the
    number of days elapsed since 2020-01-01, so that a given index resolves to a different instance
    type every day. The following keys are produced, where `<REGION>` is the region name upper-cased
    with "-" replaced by "_" (e.g. `US_EAST_1`):

    * `<REGION>_INSTANCE_TYPE_<index>`: instance family (without the `.xlarge` suffix)
    * `<REGION>_INSTANCE_TYPE_<index>_AZ`: availability zone or region where that type can be launched
    * `<REGION>_GPU_INSTANCE_TYPE_<index>` / `<REGION>_GPU_INSTANCE_TYPE_<index>_AZ`: same, GPU types only

    This is best effort: if anything goes wrong while querying EC2 for a region (or no `.xlarge`
    offering is found), the region falls back to `c5` (indexes 0-99) and `g4dn` (indexes 0-9).

    :param regions: regions to populate; defaults to three big regions to limit the number of API calls
    :param today: date driving the rotation; defaults to the current date
    :return: a dict mapping jinja variable names to instance families / locations
    """
    # A tuple, so it can be passed directly to str.startswith.
    excluded_instance_type_prefixes = (
        "m1",
        "m2",
        "m3",
        "m4",
        "t1",
        "t2",
        "c1",
        "c3",
        "c4",
        "r3",
        "r4",
        "x1",
        "x1e",
        "d2",
        "h1",
        "i2",
        "i3",
        "f1",
        "g3",
        "p2",
        "p3",
    )
    if today is None:
        today = date.today()
    today_number = (today - date(2020, 1, 1)).days

    result = {}
    for region in regions:
        # The following conversion is required because Jinja variable names cannot contain "-"
        region_jinja = region.replace("-", "_").upper()
        try:
            # Client creation is inside the try block so that a failure here also triggers the fallback.
            ec2_client = boto3.client("ec2", region_name=region)
            zones_by_type = _collect_xlarge_offerings(ec2_client, excluded_instance_type_prefixes)
            xlarge_instances = sorted(zones_by_type)
            if not xlarge_instances:
                raise ValueError(f"no .xlarge instance type offering found in {region}")
            gpu_instances = sorted(_filter_gpu_instance_types(ec2_client, xlarge_instances))

            region_parameters = _rotated_instance_type_parameters(
                f"{region_jinja}_INSTANCE_TYPE", xlarge_instances, zones_by_type, region, today_number
            )
            region_parameters.update(
                _rotated_instance_type_parameters(
                    f"{region_jinja}_GPU_INSTANCE_TYPE", gpu_instances, zones_by_type, region, today_number
                )
            )
        except Exception as e:
            # Deliberately broad: rendering the test configuration must not fail because of this lookup.
            logging.warning(
                "Error getting instance types: %s. Using c5 and g4dn as the default instance type", e
            )
            region_parameters = _default_instance_type_parameters(region_jinja, region)
        # Merged only once the region is fully resolved, so a failure never leaves partial entries behind.
        result.update(region_parameters)
    return result
150+
151+
63152
def _get_available_amis_oss(architecture, args=None, config=None):
64153
"""
65154
Gets available AMIs for given architecture from input.
@@ -97,7 +186,9 @@ def read_config_file(config_file, print_rendered=False, config=None, args=None,
97186
:return: a dict containig the parsed config file
98187
"""
99188
logging.info("Parsing config file: %s", config_file)
100-
rendered_config = _render_config_file(config_file, **kwargs, **_get_os_parameters(config=config, args=args))
189+
rendered_config = _render_config_file(
190+
config_file, **kwargs, **_get_os_parameters(config=config, args=args), **_get_instance_type_parameters()
191+
)
101192
try:
102193
return yaml.safe_load(rendered_config)
103194
except Exception:

tests/integration-tests/tests/health_checks/test_gpu_health_checks.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,11 @@ def test_cluster_with_gpu_health_checks(
7676
),
7777
},
7878
}
79-
cluster_config = pcluster_config_reader()
79+
if architecture == "x86_64":
80+
non_gpu_instance = "c5.xlarge"
81+
else:
82+
non_gpu_instance = "m6g.xlarge"
83+
cluster_config = pcluster_config_reader(non_gpu_instance=non_gpu_instance)
8084
cluster = clusters_factory(cluster_config)
8185
assert_head_node_is_running(region, cluster)
8286
remote_command_executor = RemoteCommandExecutor(cluster)

tests/integration-tests/tests/health_checks/test_gpu_health_checks/test_cluster_with_gpu_health_checks/pcluster.config.yaml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Image:
22
Os: {{ os }}
33
HeadNode:
4-
InstanceType: {{ instance }}
4+
InstanceType: {{ non_gpu_instance }}
55
Networking:
66
SubnetId: {{ public_subnet_id }}
77
Ssh:
@@ -18,49 +18,49 @@ Scheduling:
1818
ComputeResources:
1919
- Name: compute-resource-1
2020
Instances:
21-
- InstanceType: g4dn.xlarge
21+
- InstanceType: {{ instance }}
2222
HealthChecks:
2323
Gpu:
2424
Enabled: false
2525
- Name: compute-resource-2
2626
Instances:
27-
- InstanceType: g4dn.xlarge
27+
- InstanceType: {{ instance }}
2828
HealthChecks:
2929
Gpu:
3030
Enabled: true
3131
- Name: compute-resource-3
3232
Instances:
33-
- InstanceType: g4dn.xlarge
33+
- InstanceType: {{ instance }}
3434
MinCount: 1
3535
- Name: compute-resource-4
3636
Instances:
37-
- InstanceType: c5.xlarge
37+
- InstanceType: {{ non_gpu_instance }}
3838
HealthChecks:
3939
Gpu:
4040
Enabled: false
4141
- Name: compute-resource-5
4242
Instances:
43-
- InstanceType: c5.xlarge
43+
- InstanceType: {{ non_gpu_instance }}
4444
HealthChecks:
4545
Gpu:
4646
Enabled: true
4747
- Name: compute-resource-6
4848
Instances:
49-
- InstanceType: c5.xlarge
49+
- InstanceType: {{ non_gpu_instance }}
5050
Networking:
5151
SubnetIds:
5252
- {{ private_subnet_id }}
5353
- Name: queue-2
5454
ComputeResources:
5555
- Name: compute-resource-1
5656
Instances:
57-
- InstanceType: g4dn.xlarge
57+
- InstanceType: {{ instance }}
5858
HealthChecks:
5959
Gpu:
6060
Enabled: true
6161
- Name: compute-resource-2
6262
Instances:
63-
- InstanceType: c5.xlarge
63+
- InstanceType: {{ non_gpu_instance }}
6464
HealthChecks:
6565
Gpu:
6666
Enabled: true

0 commit comments

Comments
 (0)