@@ -313,40 +313,70 @@ def _check_or_create_capacity_reservations(config_file, os_parameters, instance_
313313 for var in variables :
314314 if "CAPACITY_RESERVATION" in var :
315315 logging .info ("Checking capacity reservation for %s" , var )
316- count , enable_placement_group , hours , instance_type , os = _parse_capacity_reservation_variable (var )
317- instance_type , os_platform = _resolve_instance_type_and_os (
318- instance_type , instance_type_parameters , os , os_parameters
319- )
320- end_date = datetime .now (timezone .utc ) + timedelta (hours = hours )
316+ specs = []
317+ for part in var .split ("__" ): # Support multiple instance types separated by __
318+ count , enable_placement_group , hours , instance_type , os = _parse_capacity_reservation_variable (part )
319+ instance_type , os_platform = _resolve_instance_type_and_os (
320+ instance_type , instance_type_parameters , os , os_parameters
321+ )
322+ end_date = datetime .now (timezone .utc ) + timedelta (hours = hours )
323+ specs .append ((instance_type , os_platform , count , end_date , enable_placement_group ))
321324 candidate_regions = ["us-east-1" , "us-east-2" , "us-west-2" , "eu-west-1" ]
322- if _find_and_modify_existing_capacity_reservation (
323- az_for_capacity_reservation , candidate_regions , count , end_date , instance_type , var , os_platform
324- ):
325- continue
326- capacity_reservation_created = False
327- try :
328- for region in candidate_regions :
329- ec2_client = boto3 .client ("ec2" , region_name = region )
330- capacity_reservation_created = _create_capacity_reservation (
331- az_for_capacity_reservation ,
332- count ,
333- ec2_client ,
334- end_date ,
335- instance_type ,
336- var ,
337- os_platform ,
338- enable_placement_group ,
339- )
340- if capacity_reservation_created :
341- break
342- except Exception :
343- az_for_capacity_reservation [var ] = "use1-az6"
344- if not capacity_reservation_created :
345- # Assign arbitrary zone if no reservations can be made
325+ if len (specs ) == 1 :
326+ # Single instance type reservation: check for existing reservation to be frugal.
327+ # It is hard to implement such logic for multiple instance types reservation.
328+ # Therefore, for multiple instance types, skip the logic and make reservation directly.
329+ instance_type , os_platform , count , end_date , enable_placement_group = specs [0 ]
330+ if _find_and_modify_existing_capacity_reservation (
331+ az_for_capacity_reservation , candidate_regions , count , end_date , instance_type , var , os_platform
332+ ):
333+ continue
334+ if not _create_capacity_reservations (az_for_capacity_reservation , candidate_regions , specs , var ):
335+ # If failed to create reservation, use use1-az6 to avoid making the test yaml syntactically wrong.
336+ logging .info ("Failed to create capacity reservation for %s. Using use1-az6" , var )
346337 az_for_capacity_reservation [var ] = "use1-az6"
347338 return az_for_capacity_reservation
348339
349340
341+ def _create_capacity_reservations (az_for_cr , regions , specs , var ): # noqa C901
342+ """Find or create capacity reservations for multiple instance types in the same AZ."""
343+ for region in regions :
344+ try :
345+ ec2_client = boto3 .client ("ec2" , region_name = region )
346+ for az in ec2_client .describe_availability_zones ()["AvailabilityZones" ]:
347+ if az ["ZoneType" ] != "availability-zone" :
348+ continue
349+ zone_id = az ["ZoneId" ]
350+ created_capacity_reservation_ids = []
351+ success = True
352+ for instance_type , os_platform , count , end_date , enable_placement_group in specs :
353+ reservation_id = _create_single_capacity_reservation (
354+ zone_id , count , ec2_client , end_date , instance_type , os_platform , enable_placement_group
355+ )
356+ if reservation_id :
357+ created_capacity_reservation_ids .append (reservation_id )
358+ else :
359+ success = False
360+ break
361+ if success :
362+ az_for_cr [var ] = zone_id
363+ logging .info ("Created reservations for all instance types in %s" , zone_id )
364+ return True
365+ for reservation_id in created_capacity_reservation_ids :
366+ try :
367+ logging .info (
368+ "Some instance types cannot be reserved in %s, cancelling back reservation %s" ,
369+ az ,
370+ reservation_id ,
371+ )
372+ ec2_client .cancel_capacity_reservation (CapacityReservationId = reservation_id )
373+ except Exception :
374+ pass
375+ except Exception as e :
376+ logging .info ("Failed creating reservations in %s: %s" , region , e )
377+ return False
378+
379+
350380def _resolve_instance_type_and_os (instance_type , instance_type_parameters , os , os_parameters ):
351381 if "INSTANCE_TYPE" in instance_type :
352382 # The value of the Jinja INSTANCE_TYPE variable can contain a size or not, e.g. trn1.32xlarge vs trn1.
@@ -386,50 +416,44 @@ def _parse_capacity_reservation_variable(var):
386416 return count , enable_placement_group , hours , instance_type , os
387417
388418
389- def _create_capacity_reservation (
390- az_for_capacity_reservation , count , ec2_client , end_date , instance_type , var , os_platform , enable_placement_group
419+ def _create_single_capacity_reservation (
420+ zone_id , count , ec2_client , end_date , instance_type , os_platform , enable_placement_group
391421):
392- capacity_reservation_created = False
393- for availability_zone in ec2_client .describe_availability_zones ()["AvailabilityZones" ]:
394- if availability_zone ["ZoneType" ] == "availability-zone" :
422+ try :
423+ reservation_args = {
424+ "InstanceType" : instance_type ,
425+ "InstancePlatform" : os_platform ,
426+ "AvailabilityZoneId" : zone_id ,
427+ "InstanceCount" : count ,
428+ "EndDateType" : "limited" ,
429+ "EndDate" : end_date ,
430+ "Tenancy" : "default" ,
431+ }
432+ if enable_placement_group :
433+ placement_group_name = f"{ instance_type } _placement_group_{ zone_id } "
395434 try :
396- zone_id = availability_zone ["ZoneId" ]
397- reservation_args = {
398- "InstanceType" : instance_type ,
399- "InstancePlatform" : os_platform ,
400- "AvailabilityZoneId" : zone_id ,
401- "InstanceCount" : count ,
402- "EndDateType" : "limited" ,
403- "EndDate" : end_date ,
404- "Tenancy" : "default" ,
405- }
406- if enable_placement_group :
407- placement_group_name = f"{ instance_type } _placement_group_{ zone_id } "
408- try :
409- placement_group_arn = ec2_client .describe_placement_groups (GroupNames = [placement_group_name ])[
410- "PlacementGroups"
411- ][0 ]["GroupArn" ]
412- except Exception :
413- placement_group_arn = ec2_client .create_placement_group (
414- GroupName = placement_group_name , Strategy = "cluster"
415- )["PlacementGroup" ]["GroupArn" ]
416- reservation_args ["PlacementGroupArn" ] = placement_group_arn
417- ec2_client .create_capacity_reservation (** reservation_args )
418- logging .info (
419- "Capacity reservation for %s %s on %s created in %s" , count , instance_type , os_platform , zone_id
420- )
421- capacity_reservation_created = True
422- az_for_capacity_reservation [var ] = zone_id
423- break
424- except Exception as e :
425- logging .info (
426- "Capacity reservation for %s %s failed to create in %s" ,
427- count ,
428- instance_type ,
429- zone_id ,
430- )
431- logging .info (e )
432- return capacity_reservation_created
435+ placement_group_arn = ec2_client .describe_placement_groups (GroupNames = [placement_group_name ])[
436+ "PlacementGroups"
437+ ][0 ]["GroupArn" ]
438+ except Exception :
439+ placement_group_arn = ec2_client .create_placement_group (
440+ GroupName = placement_group_name , Strategy = "cluster"
441+ )["PlacementGroup" ]["GroupArn" ]
442+ reservation_args ["PlacementGroupArn" ] = placement_group_arn
443+ reservation_id = ec2_client .create_capacity_reservation (** reservation_args )["CapacityReservation" ][
444+ "CapacityReservationId"
445+ ]
446+ logging .info ("Capacity reservation for %s %s on %s created in %s" , count , instance_type , os_platform , zone_id )
447+ return reservation_id
448+ except Exception as e :
449+ logging .info (
450+ "Capacity reservation for %s %s failed to create in %s" ,
451+ count ,
452+ instance_type ,
453+ zone_id ,
454+ )
455+ logging .info (e )
456+ return None
433457
434458
435459def _find_and_modify_existing_capacity_reservation ( # noqa: C901
0 commit comments