[integ-test] rotate instance type for some integration tests

hanwen-cluster · hgreebe · commit c9d859c403af · 2025-02-25T17:42:08.000-05:00
1. Unlike rotating OS, instance types are region-dependent. Therefore, we cannot use general Jinja variables like `{{ OS_X86_1 }}`. We need to use region-specific Jinja variables like `{{ US_EAST_1_INSTANCE_TYPE_0 }}`.
2. For code efficiency, this commit only populates three large AWS regions. The code is extendable if more regions should be added.
3. This commit rotates instance types only on `test_essential_features` and `test_cluster_with_gpu_health_checks`. The code is extendable if more tests should be added.
4. Improve `test_cluster_with_gpu_health_checks` to be able to run on both x86 and arm

Signed-off-by: Hanwen <hanwenli@amazon.com>
diff --git a/tests/integration-tests/configs/develop.yaml b/tests/integration-tests/configs/develop.yaml
@@ -17,8 +17,8 @@ test-suites:
   basic:
     test_essential_features.py::test_essential_features:
       dimensions:
-        - regions: ["af-south-1"]
-          instances: {{ common.INSTANCES_DEFAULT_X86 }}
+        - regions: [{{ US_EAST_1_INSTANCE_TYPE_0_AZ }}]
+          instances: [{{ US_EAST_1_INSTANCE_TYPE_0 }}.xlarge]
           oss: [{{ OS_X86_1 }}]
           schedulers: ["slurm"]
   capacity_reservations:
@@ -288,8 +288,8 @@ test-suites:
   health_checks:
     test_gpu_health_checks.py::test_cluster_with_gpu_health_checks:
       dimensions:
-        - regions: ["eu-west-1"]
-          instances: {{ common.INSTANCES_DEFAULT_X86 }}
+        - regions: [{{ EU_WEST_1_GPU_INSTANCE_TYPE_0_AZ }}]
+          instances: [{{ EU_WEST_1_GPU_INSTANCE_TYPE_0 }}.xlarge]
           oss: [{{ OS_X86_5 }}]
           schedulers: ["slurm"]
   iam:
diff --git a/tests/integration-tests/framework/tests_configuration/config_renderer.py b/tests/integration-tests/framework/tests_configuration/config_renderer.py
@@ -13,6 +13,7 @@
 import os
 from datetime import date
 
+import boto3
 import yaml
 from jinja2 import FileSystemLoader
 from jinja2.sandbox import SandboxedEnvironment
@@ -60,6 +61,94 @@ def _get_os_parameters(config=None, args=None):
     return result
 
 
+def _get_instance_type_parameters():  # noqa: C901
+    """Gets Instance jinja parameters."""
+    result = {}
+    excluded_instance_type_prefixes = [  # name prefixes rejected by the startswith() filter below
+        "m1",
+        "m2",
+        "m3",
+        "m4",
+        "t1",
+        "t2",
+        "c1",
+        "c3",
+        "c4",
+        "r3",
+        "r4",
+        "x1",
+        "x1e",
+        "d2",
+        "h1",
+        "i2",
+        "i3",
+        "f1",
+        "g3",
+        "p2",
+        "p3",
+    ]
+    for region in ["us-east-1", "us-west-2", "eu-west-1"]:  # Only populate instance type for big regions
+        ec2_client = boto3.client("ec2", region_name=region)
+        # The following conversion is required because Jinja variable names cannot contain "-"
+        region_jinja = region.replace("-", "_").upper()
+        try:
+            xlarge_instances = []
+            instance_type_availability_zones = {}
+            # Use describe_instance_type_offerings with pagination
+            paginator = ec2_client.get_paginator("describe_instance_type_offerings")
+
+            for page in paginator.paginate(LocationType="availability-zone"):
+                for instance_type in page["InstanceTypeOfferings"]:
+                    # Keep '.xlarge' sizes only, and skip any type whose name starts with an excluded prefix
+                    if instance_type["InstanceType"].endswith(".xlarge") and not any(
+                        instance_type["InstanceType"].startswith(prefix) for prefix in excluded_instance_type_prefixes
+                    ):
+                        xlarge_instances.append(instance_type["InstanceType"])
+                        if instance_type_availability_zones.get(instance_type["InstanceType"]):
+                            instance_type_availability_zones[instance_type["InstanceType"]].append(
+                                instance_type["Location"]
+                            )
+                        else:
+                            instance_type_availability_zones[instance_type["InstanceType"]] = [
+                                instance_type["Location"]
+                            ]
+
+            xlarge_instances = list(set(xlarge_instances))  # Deduplicate: a type is appended once per offering.
+            gpu_instances = []  # NOTE(review): DescribeInstanceTypes may cap InstanceTypes per call; confirm or batch
+            paginator = ec2_client.get_paginator("describe_instance_types")
+            for page in paginator.paginate(InstanceTypes=xlarge_instances):
+                for instance_type in page["InstanceTypes"]:
+                    if instance_type.get("GpuInfo"):
+                        gpu_instances.append(instance_type["InstanceType"])
+
+            xlarge_instances.sort()
+            gpu_instances.sort()
+            today_number = (date.today() - date(2020, 1, 1)).days  # day index; advances the rotation daily
+            for index in range(len(xlarge_instances)):
+                instance_type = xlarge_instances[(today_number + index) % len(xlarge_instances)]
+                result[f"{region_jinja}_INSTANCE_TYPE_{index}"] = instance_type[: -len(".xlarge")]
+                availability_zones = instance_type_availability_zones[instance_type]
+                result[f"{region_jinja}_INSTANCE_TYPE_{index}_AZ"] = (  # first AZ if <= 2 AZs, else region
+                    availability_zones[0] if len(availability_zones) <= 2 else region
+                )
+            for index in range(len(gpu_instances)):
+                instance_type = gpu_instances[(today_number + index) % len(gpu_instances)]
+                result[f"{region_jinja}_GPU_INSTANCE_TYPE_{index}"] = instance_type[: -len(".xlarge")]
+                availability_zones = instance_type_availability_zones[instance_type]
+                result[f"{region_jinja}_GPU_INSTANCE_TYPE_{index}_AZ"] = (  # same AZ rule as above
+                    availability_zones[0] if len(availability_zones) <= 2 else region
+                )
+        except Exception as e:  # best-effort: any failure falls back to the fixed defaults below
+            print(f"Error getting instance types: {str(e)}. Using c5 and g4dn as the default instance type")
+            for index in range(100):
+                result[f"{region_jinja}_INSTANCE_TYPE_{index}"] = "c5"
+                result[f"{region_jinja}_INSTANCE_TYPE_{index}_AZ"] = region
+            for index in range(10):
+                result[f"{region_jinja}_GPU_INSTANCE_TYPE_{index}"] = "g4dn"
+                result[f"{region_jinja}_GPU_INSTANCE_TYPE_{index}_AZ"] = region
+    return result
+
+
 def _get_available_amis_oss(architecture, args=None, config=None):
     """
     Gets available AMIs for given architecture from input.
@@ -97,7 +186,9 @@ def read_config_file(config_file, print_rendered=False, config=None, args=None,
     :return: a dict containig the parsed config file
     """
     logging.info("Parsing config file: %s", config_file)
-    rendered_config = _render_config_file(config_file, **kwargs, **_get_os_parameters(config=config, args=args))
+    rendered_config = _render_config_file(
+        config_file, **kwargs, **_get_os_parameters(config=config, args=args), **_get_instance_type_parameters()
+    )
     try:
         return yaml.safe_load(rendered_config)
     except Exception:
diff --git a/tests/integration-tests/tests/health_checks/test_gpu_health_checks.py b/tests/integration-tests/tests/health_checks/test_gpu_health_checks.py
@@ -76,7 +76,11 @@ def test_cluster_with_gpu_health_checks(
             ),
         },
     }
-    cluster_config = pcluster_config_reader()
+    if architecture == "x86_64":
+        non_gpu_instance = "c5.xlarge"
+    else:
+        non_gpu_instance = "m6g.xlarge"
+    cluster_config = pcluster_config_reader(non_gpu_instance=non_gpu_instance)
     cluster = clusters_factory(cluster_config)
     assert_head_node_is_running(region, cluster)
     remote_command_executor = RemoteCommandExecutor(cluster)
diff --git a/tests/integration-tests/tests/health_checks/test_gpu_health_checks/test_cluster_with_gpu_health_checks/pcluster.config.yaml b/tests/integration-tests/tests/health_checks/test_gpu_health_checks/test_cluster_with_gpu_health_checks/pcluster.config.yaml
@@ -1,7 +1,7 @@
 Image:
   Os: {{ os }}
 HeadNode:
-  InstanceType: {{ instance }}
+  InstanceType: {{ non_gpu_instance }}
   Networking:
     SubnetId: {{ public_subnet_id }}
   Ssh:
@@ -18,49 +18,49 @@ Scheduling:
     ComputeResources:
     - Name: compute-resource-1
       Instances:
-        - InstanceType: g4dn.xlarge
+        - InstanceType: {{ instance }}
       HealthChecks:
         Gpu:
           Enabled: false
     - Name: compute-resource-2
       Instances:
-        - InstanceType: g4dn.xlarge
+        - InstanceType: {{ instance }}
       HealthChecks:
         Gpu:
           Enabled: true
     - Name: compute-resource-3
       Instances:
-        - InstanceType: g4dn.xlarge
+        - InstanceType: {{ instance }}
       MinCount: 1
     - Name: compute-resource-4
       Instances:
-        - InstanceType: c5.xlarge
+        - InstanceType: {{ non_gpu_instance }}
       HealthChecks:
         Gpu:
           Enabled: false
     - Name: compute-resource-5
       Instances:
-        - InstanceType: c5.xlarge
+        - InstanceType: {{ non_gpu_instance }}
       HealthChecks:
         Gpu:
           Enabled: true
     - Name: compute-resource-6
       Instances:
-        - InstanceType: c5.xlarge
+        - InstanceType: {{ non_gpu_instance }}
     Networking:
       SubnetIds:
         - {{ private_subnet_id }}
   - Name: queue-2
     ComputeResources:
     - Name: compute-resource-1
       Instances:
-        - InstanceType: g4dn.xlarge
+        - InstanceType: {{ instance }}
       HealthChecks:
         Gpu:
           Enabled: true
     - Name: compute-resource-2
       Instances:
-        - InstanceType: c5.xlarge
+        - InstanceType: {{ non_gpu_instance }}
       HealthChecks:
         Gpu:
           Enabled: true