Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@ CHANGELOG
- Add validator that warns against the downsides of disabling in-place updates on compute and login nodes through DevSettings.
- Upgrade jmespath to ~=1.0 (from ~=0.10).
- Upgrade tabulate to <=0.9.0 (from <=0.8.10).
- Add support for p6-b300 instances for all OSs except AL2.
- Add permission `cloudwatch:PutMetricData` to the head node policy so that clustermgtd is able to emit metrics.
- Add alarm on missing clustermgtd heartbeat.

5 changes: 5 additions & 0 deletions cli/src/pcluster/constants.py
@@ -39,6 +39,8 @@
UNSUPPORTED_OSES_FOR_MICRO_NANO = ["ubuntu2204", "ubuntu2404", "rhel8", "rocky8", "rhel9", "rocky9"]
UNSUPPORTED_OSES_FOR_P6E_GB200 = ["rhel8", "rocky8", "alinux2", "rhel9"]
SUPPORTED_OSES_FOR_P6E_GB200 = list(set(SUPPORTED_OSES) - set(UNSUPPORTED_OSES_FOR_P6E_GB200))
UNSUPPORTED_OSES_FOR_P6_B300 = ["alinux2"]
SUPPORTED_OSES_FOR_P6_B300 = list(set(SUPPORTED_OSES) - set(UNSUPPORTED_OSES_FOR_P6_B300))
UNSUPPORTED_OSES_FOR_DCV = []
UNSUPPORTED_OSES_FOR_NON_GPU_DCV = ["rocky9", "rhel9"]
UNSUPPORTED_ARM_OSES_FOR_DCV = []
@@ -352,6 +354,9 @@ class Operation(Enum):
PCLUSTER_BUILD_IMAGE_CLEANUP_ROLE_BOOTSTRAP_TAG_KEY = "parallelcluster:build-image-cleanup-role-bootstrapped"

P6E_GB200 = "p6e-gb200"
P6_B300 = "p6-b300"
@gmarciani (Contributor) commented on Jan 27, 2026:

When we introduced GB200 we did it for a limited set of OSes. Are we sure we do not have any OS limitation for B300? For instance, according to the documentation for NVIDIA 580, AL2 is not mentioned as a supported OS: https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-580-126-09/index.html

The PR author (Contributor Author) replied:

I added a validator to prevent al2 and b300 from being used together.
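
For context, a minimal sketch of what the new constants and the validator check in this PR amount to. The SUPPORTED_OSES list below is an assumption for illustration only; the real definitions live in cli/src/pcluster/constants.py and cli/src/pcluster/validators/ec2_validators.py in this diff.

```python
# Minimal sketch, not the production code: derive the supported-OS list for
# p6-b300 and reject alinux2 when it is paired with a p6-b300 instance type.
SUPPORTED_OSES = ["alinux2", "alinux2023", "ubuntu2204", "ubuntu2404", "rhel8", "rhel9", "rocky8", "rocky9"]  # assumed
UNSUPPORTED_OSES_FOR_P6_B300 = ["alinux2"]
SUPPORTED_OSES_FOR_P6_B300 = list(set(SUPPORTED_OSES) - set(UNSUPPORTED_OSES_FOR_P6_B300))


def validate_p6_b300_os(instance_type: str, os: str) -> str | None:
    """Return an error message when a p6-b300 instance type is combined with an unsupported OS."""
    if instance_type.startswith("p6-b300") and os in UNSUPPORTED_OSES_FOR_P6_B300:
        return (
            f"The instance type {instance_type} is not supported with OS {os}."
            f" Please use one of the following OS: {SUPPORTED_OSES_FOR_P6_B300}"
        )
    return None
```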

INSTANCE_TYPES_WITH_FIRST_INTERFACE_ENA = [P6E_GB200, P6_B300]

ULTRASERVER_INSTANCE_PREFIX_LIST = [P6E_GB200]
# Dictionary mapping ultraserver instance prefixes to their allowed capacity block sizes
ULTRASERVER_CAPACITY_BLOCK_ALLOWED_SIZE_DICT = {
25 changes: 19 additions & 6 deletions cli/src/pcluster/templates/queues_stack.py
@@ -11,8 +11,10 @@
from pcluster.config.common import DefaultUserHomeType, SharedStorageType
from pcluster.constants import (
DEFAULT_EPHEMERAL_DIR,
INSTANCE_TYPES_WITH_FIRST_INTERFACE_ENA,
NODE_BOOTSTRAP_TIMEOUT,
OS_MAPPING,
P6_B300,
P6E_GB200,
PCLUSTER_COMPUTE_RESOURCE_NAME_TAG,
PCLUSTER_QUEUE_NAME_TAG,
@@ -368,9 +370,12 @@ def add_network_interfaces(
queue_lt_security_groups,
):
"""Generate launch template network interfaces list."""
is_gb200 = compute_resource.instance_types[0].split(".")[0] == P6E_GB200
instance_family = compute_resource.instance_types[0].split(".")[0]
is_gb200 = instance_family == P6E_GB200
is_b300 = instance_family == P6_B300
efa_enabled = compute_resource.efa and compute_resource.efa.enabled
interface_type = "efa" if efa_enabled and not is_gb200 else None
# gb200 and b300 instances require the first interface to be ENA even if EFA is enabled
interface_type = "efa" if efa_enabled and instance_family not in INSTANCE_TYPES_WITH_FIRST_INTERFACE_ENA else None

compute_lt_nw_interfaces = [
ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
@@ -390,10 +395,18 @@
if is_gb200 and not efa_enabled and not even:
continue

interface_type = "efa" if efa_enabled else None
# if efa is enabled with a gb200 instance, even indexes are configured as efa and the odd as efa-only
if is_gb200 and efa_enabled:
interface_type = "efa" if even else "efa-only"
if efa_enabled:
if is_b300:
# if efa is enabled with a b300 instance, all network cards, except for the primary,
# are configured as efa-only
interface_type = "efa-only"
elif is_gb200:
# if efa is enabled with a gb200 instance, even indexes are configured as efa and the odd as efa-only
interface_type = "efa" if even else "efa-only"
else:
interface_type = "efa"
else:
interface_type = None

compute_lt_nw_interfaces.append(
ec2.CfnLaunchTemplate.NetworkInterfaceProperty(
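To make the branching in this hunk easier to follow, here is a condensed, standalone restatement of the interface-type selection. The helper name select_interface_type is hypothetical; this is a sketch of the rules the diff implements, not the actual template builder.

```python
def select_interface_type(instance_family: str, efa_enabled: bool, network_card_index: int) -> str | None:
    """Condensed restatement of the launch-template logic above (sketch only).

    p6e-gb200 and p6-b300 require the first network interface to be ENA even when
    EFA is enabled, so the primary card never gets interface_type="efa".
    """
    first_iface_ena = instance_family in ("p6e-gb200", "p6-b300")
    if network_card_index == 0:
        return "efa" if efa_enabled and not first_iface_ena else None
    if not efa_enabled:
        return None  # note: gb200 without EFA also skips odd cards entirely (not modeled here)
    if instance_family == "p6-b300":
        return "efa-only"  # every non-primary card is EFA-only on b300
    if instance_family == "p6e-gb200":
        return "efa" if network_card_index % 2 == 0 else "efa-only"  # even cards: efa, odd: efa-only
    return "efa"
```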
9 changes: 9 additions & 0 deletions cli/src/pcluster/validators/ec2_validators.py
@@ -23,10 +23,13 @@
CAPACITY_BLOCK_INACTIVE_STATES,
CAPACITY_RESERVATION_OS_MAP,
NVIDIA_OPENRM_UNSUPPORTED_INSTANCE_TYPES,
P6_B300,
SUPPORTED_OSES_FOR_P6_B300,
SUPPORTED_OSES_FOR_P6E_GB200,
ULTRASERVER_CAPACITY_BLOCK_ALLOWED_SIZE_DICT,
ULTRASERVER_INSTANCE_PREFIX_LIST,
UNSUPPORTED_OSES_FOR_MICRO_NANO,
UNSUPPORTED_OSES_FOR_P6_B300,
UNSUPPORTED_OSES_FOR_P6E_GB200,
)
from pcluster.utils import get_needed_ultraserver_capacity_block_statuses, get_resource_name_from_resource_arn
@@ -211,6 +214,12 @@ def _validate(self, instance_type: str, os: str):
" Please use one of the following OS: {2}".format(instance_type, os, SUPPORTED_OSES_FOR_P6E_GB200),
FailureLevel.ERROR,
)
if instance_type.startswith(P6_B300) and os in UNSUPPORTED_OSES_FOR_P6_B300:
self._add_failure(
"The instance type {0} is not supported with OS {1}."
" Please use one of the following OS: {2}".format(instance_type, os, SUPPORTED_OSES_FOR_P6_B300),
FailureLevel.ERROR,
)


class KeyPairValidator(Validator):
22 changes: 22 additions & 0 deletions cli/tests/pcluster/templates/test_queues_stack.py
@@ -221,6 +221,28 @@ def maximum_network_interfaces(self):
{"network_card_index": 2, "interface_type": None, "device_index": 1},
],
),
(
True,
"p6-b300.WHATEVER_SIZE",
[NetworkCard(0, 4), NetworkCard(1, 4), NetworkCard(2, 4), NetworkCard(3, 4), NetworkCard(4, 4)],
[
{"network_card_index": 0, "interface_type": None, "device_index": 0},
{"network_card_index": 1, "interface_type": "efa-only", "device_index": 1},
{"network_card_index": 2, "interface_type": "efa-only", "device_index": 1},
{"network_card_index": 3, "interface_type": "efa-only", "device_index": 1},
{"network_card_index": 4, "interface_type": "efa-only", "device_index": 1},
],
),
(
False,
"p6-b300.WHATEVER_SIZE",
[NetworkCard(0, 4), NetworkCard(1, 4), NetworkCard(2, 4)],
[
{"network_card_index": 0, "interface_type": None, "device_index": 0},
{"network_card_index": 1, "interface_type": None, "device_index": 1},
{"network_card_index": 2, "interface_type": None, "device_index": 1},
],
),
],
)
def test_add_compute_resource_launch_template(
11 changes: 11 additions & 0 deletions cli/tests/pcluster/validators/test_ec2_validators.py
@@ -461,6 +461,17 @@ def test_instance_type_base_ami_compatible_validator(
"rocky9",
None,
),
(
"p6-b300.WHATEVER_SIZE",
"alinux2",
"The instance type p6-b300.WHATEVER_SIZE is not supported with OS alinux2. "
"Please use one of the following OS",
),
(
"p6-b300.WHATEVER_SIZE",
"alinux2023",
None,
),
],
)
def test_instance_type_os_compatible_validator(instance_type, os, expected_message):
@@ -6,7 +6,7 @@ rm -rf /shared/${1}

module load ${1}
NCCL_BENCHMARKS_VERSION='2.17.1'
NCCL_VERSION='2.28.3-1'
NCCL_VERSION='2.28.9-1'
MPI_HOME=$(which mpirun | awk -F '/bin' '{print $1}')
NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_90,code=sm_90 -gencode=arch=compute_90,code=compute_90" # Arch for NVIDIA A100 and H100, ref https://docs.nvidia.com/cuda/ada-compatibility-guide/index.html

@@ -28,8 +28,8 @@ NVCC_GENCODE="${NVCC_GENCODE}" make MPI=1 MPI_HOME=${MPI_HOME} NCCL_HOME=/shared

# Compile OFI NCCL plugin for RHEL and Rocky because EFA doesn't ship the plugin on the OSes
. /etc/os-release
if [[ $ID==rhel || $ID==rocky ]]; then
OFI_NCCL_VERSION='1.16.3'
if [[ "$ID" == "rhel" || "$ID" == "rocky" ]]; then
OFI_NCCL_VERSION='1.18.0'
wget https://github.com/aws/aws-ofi-nccl/archive/v${OFI_NCCL_VERSION}.tar.gz
tar xvfz v${OFI_NCCL_VERSION}.tar.gz
cd aws-ofi-nccl-${OFI_NCCL_VERSION}
@@ -3,7 +3,7 @@
#SBATCH --exclusive

module load openmpi
NCCL_VERSION='2.28.3-1'
NCCL_VERSION='2.28.9-1'
NCCL_BENCHMARKS_VERSION='2.17.1'

. /etc/os-release
1 change: 1 addition & 0 deletions tests/integration-tests/tests/common/nccl_common.py
@@ -71,6 +71,7 @@ def install_and_run_nccl_benchmarks(remote_command_executor, mpi_module, schedul
# p5.48xlarge - Expected "in-place busbw" bandwidth with 2 nodes, 8 tasks per node is about 250GB/s
"p5.48xlarge": 250.0,
"p6-b200.48xlarge": 570, # Initial testing performance 631.17
"p6-b300.48xlarge": 675, # Initial testing performance 698.7
"p6e-gb200.36xlarge": 650, # Initial testing performance 719.17
}

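The new p6-b300 entry extends the expected-bandwidth table; roughly, the measured benchmark result is compared against it as sketched below. The helper check_nccl_busbw is hypothetical, not the actual test code; the values come from the dict in the diff.

```python
# Hypothetical check against the per-instance-type expected "in-place busbw" (GB/s).
EXPECTED_BUSBW_GBPS = {
    "p5.48xlarge": 250.0,
    "p6-b200.48xlarge": 570,    # initial testing measured ~631.17
    "p6-b300.48xlarge": 675,    # initial testing measured ~698.7
    "p6e-gb200.36xlarge": 650,  # initial testing measured ~719.17
}


def check_nccl_busbw(instance_type: str, measured_gbps: float) -> None:
    """Fail if the measured NCCL bus bandwidth is below the expected minimum."""
    expected = EXPECTED_BUSBW_GBPS[instance_type]
    assert measured_gbps >= expected, (
        f"{instance_type}: measured busbw {measured_gbps} GB/s < expected {expected} GB/s"
    )
```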
11 changes: 7 additions & 4 deletions tests/integration-tests/tests/efa/test_efa.py
@@ -61,11 +61,13 @@ def test_efa(
# when the instance type is available in open capacity pool
head_node_instance = instance
max_queue_size = 2
p6_b200_capacity_reservation_id = None
if instance == "p6-b200.48xlarge":
capacity_reservation_id = None
# p family instances need capacity blocks and so placement group is set to false
placement_group_enabled = not instance.startswith("p")
if instance in ("p6-b200.48xlarge", "p6-b300.48xlarge"):
capacity_reservations_ids = get_capacity_reservation_id(request, instance, region, max_queue_size, os)
if capacity_reservations_ids:
p6_b200_capacity_reservation_id = capacity_reservations_ids[0].get("CapacityReservationId")
capacity_reservation_id = capacity_reservations_ids[0].get("CapacityReservationId")
else:
message = f"Skipping the test as no Capacity Block for {instance} and os {os} was found in {region}"
logging.warn(message)
@@ -75,7 +77,8 @@
cluster_config = pcluster_config_reader(
head_node_instance=head_node_instance,
max_queue_size=max_queue_size,
p6_b200_capacity_reservation_id=p6_b200_capacity_reservation_id,
capacity_reservation_id=capacity_reservation_id,
placement_group_enabled=placement_group_enabled,
)
cluster = clusters_factory(cluster_config)
remote_command_executor = RemoteCommandExecutor(cluster)
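The test setup above boils down to: p-family instances run in Capacity Blocks (so the placement group is disabled), and the test is skipped when no reservation is available. A condensed sketch follows; the helper name resolve_capacity_reservation and the pytest.skip call are assumptions (the diff truncates right after the warning), while get_capacity_reservation_id is the helper actually used in the diff (import path omitted).

```python
import logging

import pytest  # assumed: the integration tests are pytest-based


def resolve_capacity_reservation(request, instance, region, max_queue_size, os):
    """Return a CapacityReservationId for p6-b200/p6-b300 instances, or None otherwise."""
    if instance not in ("p6-b200.48xlarge", "p6-b300.48xlarge"):
        return None
    reservations = get_capacity_reservation_id(request, instance, region, max_queue_size, os)  # from the diff
    if not reservations:
        message = f"Skipping the test as no Capacity Block for {instance} and os {os} was found in {region}"
        logging.warning(message)
        pytest.skip(message)  # assumed skip behavior
    return reservations[0].get("CapacityReservationId")


# p-family instances need Capacity Blocks, so the placement group is disabled for them:
# placement_group_enabled = not instance.startswith("p")
```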
@@ -8,7 +8,7 @@ set -ex
FABTESTS_DIR="$1"

FABTESTS_REPO="https://github.com/ofiwg/libfabric.git"
FABTESTS_VERSION="2.3.0"
FABTESTS_VERSION="2.4.0"
FABTESTS_SOURCES_DIR="$FABTESTS_DIR/sources"
LIBFABRIC_DIR="/opt/amazon/efa"
CUDA_DIR="/usr/local/cuda"
@@ -13,14 +13,14 @@ Scheduling:
Scheduler: {{ scheduler }}
SlurmQueues:
- Name: efa-enabled
{% if p6_b200_capacity_reservation_id %}
{% if capacity_reservation_id %}
CapacityType: CAPACITY_BLOCK
CapacityReservationTarget:
CapacityReservationId: {{ p6_b200_capacity_reservation_id }}
CapacityReservationId: {{ capacity_reservation_id }}
{% endif %}
Networking:
PlacementGroup:
Enabled: {% if instance not in ["p4d.24xlarge", "p6-b200.48xlarge"] %}true{% else %}false{% endif %}
Enabled: {{ placement_group_enabled }}
{% if instance in ["c5n.18xlarge", "c6gn.16xlarge"] %}Name: {{ capacity_reservation_framework_placement_group }}{% endif %}
SubnetIds:
- {{ private_subnet_id }}