Skip to content

Commit c339547

Browse files
[integ-tests] Add p6-b200 to test_efa
p6-b200 requires a capacity block. This commit makes the test automatically find the capacity block by reusing the same logic as test_gb200, which requires a small refactoring. Using a capacity block requires RunInstances instead of CreateFleet, so this commit also changes the test_efa cluster configuration file to always use RunInstances.
1 parent 9a3ef15 commit c339547

File tree

5 files changed

+60
-30
lines changed

5 files changed

+60
-30
lines changed

tests/integration-tests/configs/develop.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,10 @@ test-suites:
264264
instances: ["p4d.24xlarge"]
265265
oss: [{{ NO_RHEL_OS_X86_1 }}] # The capacity reservation cannot use RHEL operating system
266266
schedulers: ["slurm"]
267+
- regions: [ "use1-az6" ] # do not move, unless capacity reservation is moved as well
268+
instances: [ "p6-b200.48xlarge" ]
269+
oss: ["alinux2023", "ubuntu2204", "ubuntu2404", "rocky8", "rocky9"]
270+
schedulers: [ "slurm" ]
267271
- regions: [{{ c6gn_16xlarge_CAPACITY_RESERVATION_2_INSTANCES_2_HOURS_YESPG_OS_ARM_0 }}]
268272
instances: ["c6gn.16xlarge"]
269273
oss: [{{ OS_ARM_0 }}]

tests/integration-tests/tests/common/utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,3 +552,22 @@ def terminate_nodes_manually(instance_ids, region):
552552
assert_that(instance_states.get("InstanceId")).is_equal_to(instance_id)
553553
assert_that(instance_states.get("CurrentState").get("Name")).is_in("shutting-down", "terminated")
554554
logging.info("Terminated nodes: {}".format(instance_ids))
555+
556+
557+
def get_capacity_reservation_id(instance_type, region, count):
    """
    Find the EC2 Capacity Reservations that can host `count` instances of `instance_type`.

    Despite the singular name, every matching reservation is returned, not a single ID.

    :param instance_type: EC2 instance type to match exactly, e.g. "p6-b200.48xlarge".
    :param region: AWS region whose Capacity Reservations are listed.
    :param count: minimum number of instances that must still be available in a reservation.
    :return: list of dicts with the keys "CapacityReservationId", "TotalInstanceCount" and
        "AvailableInstanceCount", in the order EC2 returned them; empty when nothing matches.
    """
    ec2_client = boto3.client("ec2", region_name=region)
    paginator = ec2_client.get_paginator("describe_capacity_reservations")
    reservations_ids = []
    for page in paginator.paginate():
        for reservation in page.get("CapacityReservations", []):
            if reservation.get("InstanceType") != instance_type:
                continue
            # A missing availability count is treated as zero: comparing None with an int
            # would raise TypeError and abort the whole lookup.
            available = reservation.get("AvailableInstanceCount") or 0
            if available < count:
                continue
            reservations_ids.append(
                {
                    "CapacityReservationId": reservation["CapacityReservationId"],
                    "TotalInstanceCount": reservation["TotalInstanceCount"],
                    "AvailableInstanceCount": available,
                }
            )
    return reservations_ids

tests/integration-tests/tests/efa/test_efa.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
# See the License for the specific language governing permissions and limitations under the License.
1212
import logging
1313

14+
import pytest
1415
import xmltodict
1516
from assertpy import assert_that, soft_assertions
1617
from remote_command_executor import RemoteCommandExecutor
@@ -19,13 +20,20 @@
1920
from tests.common.assertions import assert_no_errors_in_logs
2021
from tests.common.mpi_common import _test_mpi
2122
from tests.common.nccl_common import install_and_run_nccl_benchmarks
22-
from tests.common.utils import fetch_instance_slots, read_remote_file, run_system_analyzer, wait_process_completion
23+
from tests.common.utils import (
24+
fetch_instance_slots,
25+
get_capacity_reservation_id,
26+
read_remote_file,
27+
run_system_analyzer,
28+
wait_process_completion,
29+
)
2330

2431
FABTESTS_BASIC_TESTS = ["rdm_tagged_bw", "rdm_tagged_pingpong"]
2532

2633
FABTESTS_GDRCOPY_TESTS = ["runt"]
2734

2835

36+
@pytest.mark.usefixtures("serial_execution_by_instance")
2937
def test_efa(
3038
os,
3139
region,
@@ -47,9 +55,21 @@ def test_efa(
4755
head_node_instance = "c5.18xlarge"
4856
else:
4957
head_node_instance = "c6g.16xlarge"
58+
max_queue_size = 2
59+
p6_b200_capacity_reservation_id = None
60+
if instance == "p6-b200.48xlarge":
61+
capacity_reservations_ids = get_capacity_reservation_id(instance, region, max_queue_size)
62+
if capacity_reservations_ids:
63+
p6_b200_capacity_reservation_id = capacity_reservations_ids[0].get("CapacityReservationId")
64+
else:
65+
pytest.skip(f"Skipping the test No Capacity Block for {instance} was found in {region}")
5066

5167
slots_per_instance = fetch_instance_slots(region, instance, multithreading_disabled=True)
52-
cluster_config = pcluster_config_reader(head_node_instance=head_node_instance)
68+
cluster_config = pcluster_config_reader(
69+
head_node_instance=head_node_instance,
70+
max_queue_size=max_queue_size,
71+
p6_b200_capacity_reservation_id=p6_b200_capacity_reservation_id,
72+
)
5373
cluster = clusters_factory(cluster_config)
5474
remote_command_executor = RemoteCommandExecutor(cluster)
5575
scheduler_commands = scheduler_commands_factory(remote_command_executor)

tests/integration-tests/tests/efa/test_efa/test_efa/pcluster.config.yaml

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,22 +13,22 @@ Scheduling:
1313
Scheduler: {{ scheduler }}
1414
SlurmQueues:
1515
- Name: efa-enabled
16+
{% if p6_b200_capacity_reservation_id %}
17+
CapacityType: CAPACITY_BLOCK
18+
CapacityReservationTarget:
19+
CapacityReservationId: {{ p6_b200_capacity_reservation_id }}
20+
{% endif %}
1621
Networking:
1722
PlacementGroup:
18-
Enabled: {% if instance not in ["p4d.24xlarge"] %}true{% else %}false{% endif %}
23+
Enabled: {% if instance not in ["p4d.24xlarge", "p6-b200.48xlarge"] %}true{% else %}false{% endif %}
1924
{% if instance in ["c5n.18xlarge", "c6gn.16xlarge"] %}Name: {{ capacity_reservation_framework_placement_group }}{% endif %}
2025
SubnetIds:
2126
- {{ private_subnet_id }}
2227
ComputeResources:
2328
- Name: efa-enabled-i1
24-
{% if "us-iso" in region %}
2529
InstanceType: {{ instance }}
26-
{% else %}
27-
Instances:
28-
- InstanceType: {{ instance }}
29-
{% endif %}
30-
MaxCount: 2
31-
MinCount: 2
30+
MaxCount: {{ max_queue_size }}
31+
MinCount: {{ max_queue_size }}
3232
DisableSimultaneousMultithreading: true
3333
Efa:
3434
Enabled: true

tests/integration-tests/tests/ultraserver/test_gb200.py

Lines changed: 7 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,13 @@
2525
from tests.common.mpi_common import _test_mpi
2626
from tests.common.nccl_common import install_and_run_nccl_benchmarks
2727
from tests.common.schedulers_common import SlurmCommands
28-
from tests.common.utils import fetch_instance_slots, is_existing_remote_file, read_remote_file, terminate_nodes_manually
28+
from tests.common.utils import (
29+
fetch_instance_slots,
30+
get_capacity_reservation_id,
31+
is_existing_remote_file,
32+
read_remote_file,
33+
terminate_nodes_manually,
34+
)
2935

3036
# We use placeholder IPs just to get IMEX started.
3137
# These values are hardwired in the cookbook.
@@ -339,25 +345,6 @@ def assert_topology_plugin_completely_disabled(cluster: Cluster):
339345
logging.info("TopologyPlugin correctly completely disabled")
340346

341347

342-
def get_capacity_reservation_id(instance_type, region, count):
343-
ec2_client = boto3.client("ec2", region_name=region)
344-
paginator = ec2_client.get_paginator("describe_capacity_reservations")
345-
# List to store matching reservation IDs
346-
reservations_ids = []
347-
# Paginate through the results
348-
for page in paginator.paginate():
349-
for reservation in page.get("CapacityReservations", []):
350-
if instance_type == reservation.get("InstanceType") and reservation.get("AvailableInstanceCount") >= count:
351-
reservations_ids.append(
352-
{
353-
"CapacityReservationId": reservation["CapacityReservationId"],
354-
"TotalInstanceCount": reservation["TotalInstanceCount"],
355-
"AvailableInstanceCount": reservation["AvailableInstanceCount"],
356-
}
357-
)
358-
return reservations_ids
359-
360-
361348
@pytest.mark.usefixtures("serial_execution_by_instance")
362349
@pytest.mark.usefixtures("os")
363350
def test_gb200(

0 commit comments

Comments
 (0)