Skip to content

Commit a21d31c

Browse files
[integ-tests-framework] Support dynamic capacity reservations for multiple instance types
Refactor capacity reservation logic in integration tests to support reserving multiple instance types in the same availability zone. Changes include: * Support __ separator syntax to specify multiple instance types in a single capacity reservation variable * Add _create_capacity_reservations function to handle multi-instance-type reservations atomically (rollback on partial failure) * Refactor _create_capacity_reservation to _create_single_capacity_reservation that returns reservation ID for tracking * Skip existing reservation check for multi-instance-type reservations due to complexity
1 parent 356daf7 commit a21d31c

File tree

2 files changed

+102
-71
lines changed

2 files changed

+102
-71
lines changed

tests/integration-tests/README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,7 @@ The framework includes automatic capacity reservation management for tests that
346346
- Automatically detects capacity reservation requirements in test configuration files using Jinja variables
347347
- Creates or modifies existing EC2 capacity reservations as needed
348348
- Supports placement groups and time-limited reservations
349+
- Supports reserving several instance types in the same availability zone by joining their specifications with the `__` separator. For example, you can reserve 100 c5.xlarge instances for compute nodes and 1 c5n.18xlarge instance for the head node.
349350
- Falls back to default availability zones if reservations cannot be created
350351

351352
Use Jinja variables in test configs with the pattern:
@@ -354,6 +355,12 @@ Use Jinja variables in test configs with the pattern:
354355
- regions: [{{ INSTANCE_TYPE_CAPACITY_RESERVATION_COUNT_INSTANCES_HOURS_HOURS_[YESPG|NOPG]_[OS] }}]
355356
```
356357

358+
For several instance types in the same AZ, use `__` to separate specifications:
359+
```
360+
dimensions:
361+
- regions: [{{ c5_xlarge_CAPACITY_RESERVATION_2_INSTANCES_2_HOURS_NOPG_alinux2023__m5_xlarge_CAPACITY_RESERVATION_2_INSTANCES_2_HOURS_NOPG_alinux2023 }}]
362+
```
363+
357364
Example:
358365
```
359366
# Reserve 2 c5.xlarge instances for 2 hours with placement group

tests/integration-tests/framework/tests_configuration/config_renderer.py

Lines changed: 95 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -313,40 +313,70 @@ def _check_or_create_capacity_reservations(config_file, os_parameters, instance_
313313
for var in variables:
314314
if "CAPACITY_RESERVATION" in var:
315315
logging.info("Checking capacity reservation for %s", var)
316-
count, enable_placement_group, hours, instance_type, os = _parse_capacity_reservation_variable(var)
317-
instance_type, os_platform = _resolve_instance_type_and_os(
318-
instance_type, instance_type_parameters, os, os_parameters
319-
)
320-
end_date = datetime.now(timezone.utc) + timedelta(hours=hours)
316+
specs = []
317+
for part in var.split("__"): # Support multiple instance types separated by __
318+
count, enable_placement_group, hours, instance_type, os = _parse_capacity_reservation_variable(part)
319+
instance_type, os_platform = _resolve_instance_type_and_os(
320+
instance_type, instance_type_parameters, os, os_parameters
321+
)
322+
end_date = datetime.now(timezone.utc) + timedelta(hours=hours)
323+
specs.append((instance_type, os_platform, count, end_date, enable_placement_group))
321324
candidate_regions = ["us-east-1", "us-east-2", "us-west-2", "eu-west-1"]
322-
if _find_and_modify_existing_capacity_reservation(
323-
az_for_capacity_reservation, candidate_regions, count, end_date, instance_type, var, os_platform
324-
):
325-
continue
326-
capacity_reservation_created = False
327-
try:
328-
for region in candidate_regions:
329-
ec2_client = boto3.client("ec2", region_name=region)
330-
capacity_reservation_created = _create_capacity_reservation(
331-
az_for_capacity_reservation,
332-
count,
333-
ec2_client,
334-
end_date,
335-
instance_type,
336-
var,
337-
os_platform,
338-
enable_placement_group,
339-
)
340-
if capacity_reservation_created:
341-
break
342-
except Exception:
343-
az_for_capacity_reservation[var] = "use1-az6"
344-
if not capacity_reservation_created:
345-
# Assign arbitrary zone if no reservations can be made
325+
if len(specs) == 1:
326+
# Single instance type reservation: check for existing reservation to be frugal.
327+
# It is hard to implement such logic for multiple instance types reservation.
328+
# Therefore, for multiple instance types, skip the logic and make reservation directly.
329+
instance_type, os_platform, count, end_date, enable_placement_group = specs[0]
330+
if _find_and_modify_existing_capacity_reservation(
331+
az_for_capacity_reservation, candidate_regions, count, end_date, instance_type, var, os_platform
332+
):
333+
continue
334+
if not _create_capacity_reservations(az_for_capacity_reservation, candidate_regions, specs, var):
335+
# If failed to create reservation, use use1-az6 to avoid making the test yaml syntactically wrong.
336+
logging.info("Failed to create capacity reservation for %s. Using use1-az6", var)
346337
az_for_capacity_reservation[var] = "use1-az6"
347338
return az_for_capacity_reservation
348339

349340

341+
def _create_capacity_reservations(az_for_cr, regions, specs, var):  # noqa C901
    """Create capacity reservations for every spec in one availability zone.

    Regions are tried in the given order and, within each region, every zone
    whose ``ZoneType`` is ``availability-zone``. All specs must be reserved in
    the same zone: if any spec cannot be reserved there, the reservations
    already created in that zone are cancelled and the next zone is tried.

    Args:
        az_for_cr: dict updated in place with ``az_for_cr[var] = zone_id`` on success.
        regions: region names to try, in order.
        specs: iterable of ``(instance_type, os_platform, count, end_date,
            enable_placement_group)`` tuples, one per instance type.
        var: key under which the selected zone id is stored in ``az_for_cr``.

    Returns:
        True when every spec was reserved in a single zone, False otherwise.
    """
    for region in regions:
        try:
            ec2_client = boto3.client("ec2", region_name=region)
            for az in ec2_client.describe_availability_zones()["AvailabilityZones"]:
                if az["ZoneType"] != "availability-zone":
                    continue
                zone_id = az["ZoneId"]
                created_capacity_reservation_ids = []
                success = True
                for instance_type, os_platform, count, end_date, enable_placement_group in specs:
                    reservation_id = _create_single_capacity_reservation(
                        zone_id, count, ec2_client, end_date, instance_type, os_platform, enable_placement_group
                    )
                    if reservation_id:
                        created_capacity_reservation_ids.append(reservation_id)
                    else:
                        success = False
                        break
                if success:
                    az_for_cr[var] = zone_id
                    logging.info("Created reservations for all instance types in %s", zone_id)
                    return True
                # Partial success is useless (all types must share the zone): roll back before moving on.
                _cancel_capacity_reservations(ec2_client, created_capacity_reservation_ids, zone_id)
        except Exception as e:
            # Any failure in this region is logged and the remaining regions are still tried.
            logging.info("Failed creating reservations in %s: %s", region, e)
    return False


def _cancel_capacity_reservations(ec2_client, reservation_ids, zone_id):
    """Best-effort cancel of the reservations left over from a partially successful zone."""
    for reservation_id in reservation_ids:
        logging.info(
            "Some instance types cannot be reserved in %s, rolling back reservation %s",
            zone_id,
            reservation_id,
        )
        try:
            ec2_client.cancel_capacity_reservation(CapacityReservationId=reservation_id)
        except Exception as e:
            # Keep going so the remaining reservations are still cancelled, but leave a trace:
            # a reservation that could not be cancelled keeps being billed until it expires.
            logging.warning("Failed to cancel capacity reservation %s in %s: %s", reservation_id, zone_id, e)
378+
379+
350380
def _resolve_instance_type_and_os(instance_type, instance_type_parameters, os, os_parameters):
351381
if "INSTANCE_TYPE" in instance_type:
352382
# The value of the Jinja INSTANCE_TYPE variable can contain a size or not, e.g. trn1.32xlarge vs trn1.
@@ -386,50 +416,44 @@ def _parse_capacity_reservation_variable(var):
386416
return count, enable_placement_group, hours, instance_type, os
387417

388418

389-
def _create_single_capacity_reservation(
    zone_id, count, ec2_client, end_date, instance_type, os_platform, enable_placement_group
):
    """Create one time-limited capacity reservation in the given availability zone.

    Args:
        zone_id: availability zone id passed as ``AvailabilityZoneId``.
        count: number of instances to reserve.
        ec2_client: EC2 client used for all API calls.
        end_date: expiry passed as ``EndDate`` (the reservation uses ``EndDateType="limited"``).
        instance_type: instance type to reserve.
        os_platform: value passed as ``InstancePlatform``.
        enable_placement_group: when truthy, the reservation is tied to a cluster placement
            group named ``{instance_type}_placement_group_{zone_id}``, created if it cannot be described.

    Returns:
        The ``CapacityReservationId`` of the new reservation, or None if any step failed.
    """
    try:
        reservation_args = {
            "InstanceType": instance_type,
            "InstancePlatform": os_platform,
            "AvailabilityZoneId": zone_id,
            "InstanceCount": count,
            "EndDateType": "limited",
            "EndDate": end_date,
            "Tenancy": "default",
        }
        if enable_placement_group:
            placement_group_name = f"{instance_type}_placement_group_{zone_id}"
            try:
                placement_group_arn = ec2_client.describe_placement_groups(GroupNames=[placement_group_name])[
                    "PlacementGroups"
                ][0]["GroupArn"]
            except Exception:
                # Describe failed or returned nothing: treat the group as missing and create it.
                # If creation fails too, the outer handler reports the reservation as failed.
                placement_group_arn = ec2_client.create_placement_group(
                    GroupName=placement_group_name, Strategy="cluster"
                )["PlacementGroup"]["GroupArn"]
            reservation_args["PlacementGroupArn"] = placement_group_arn
        reservation_id = ec2_client.create_capacity_reservation(**reservation_args)["CapacityReservation"][
            "CapacityReservationId"
        ]
        logging.info("Capacity reservation for %s %s on %s created in %s", count, instance_type, os_platform, zone_id)
        return reservation_id
    except Exception as e:
        # One record carrying both the request and the cause, so failures can be correlated in the logs.
        logging.info(
            "Capacity reservation for %s %s failed to create in %s: %s",
            count,
            instance_type,
            zone_id,
            e,
        )
        return None
433457

434458

435459
def _find_and_modify_existing_capacity_reservation( # noqa: C901

0 commit comments

Comments
 (0)