Skip to content
This repository was archived by the owner on Mar 20, 2023. It is now read-only.

Commit 79a1a45

Browse files
committed
STANDARD_NCv3 SR-IOV IB/RDMA transition
1 parent 649b116 commit 79a1a45

File tree

3 files changed

+62
-24
lines changed

3 files changed

+62
-24
lines changed

convoy/fleet.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1124,6 +1124,12 @@ def _construct_pool_object(
11241124
config, vm_config=pool_settings.vm_configuration)
11251125
is_windows = settings.is_windows_pool(
11261126
config, vm_config=pool_settings.vm_configuration)
1127+
logger.debug(
1128+
'pool vm_size={} ib(nd={} sriov={}) native={} windows={}'.format(
1129+
pool_settings.vm_size,
1130+
settings.is_networkdirect_rdma_pool(pool_settings.vm_size),
1131+
settings.is_sriov_rdma_pool(pool_settings.vm_size),
1132+
native, is_windows))
11271133
# get autoscale settings
11281134
if settings.is_pool_autoscale_enabled(config, pas=pool_settings.autoscale):
11291135
asenable = True
@@ -2606,7 +2612,7 @@ def _adjust_settings_for_pool_creation(config):
26062612
publisher = settings.pool_publisher(config, lower=True)
26072613
offer = settings.pool_offer(config, lower=True)
26082614
sku = settings.pool_sku(config, lower=True)
2609-
node_agent = settings.pool_custom_image_node_agent(config).lower()
2615+
node_agent = settings.pool_custom_image_node_agent(config)
26102616
if util.is_not_empty(node_agent) and util.is_not_empty(sku):
26112617
raise ValueError(
26122618
'cannot specify both a platform_image and a custom_image in the '
@@ -2655,7 +2661,7 @@ def _adjust_settings_for_pool_creation(config):
26552661
sku == 'datacenter-core-1809-with-containers-smalldisk'):
26562662
allowed = True
26572663
if (util.is_not_empty(node_agent) and
2658-
node_agent.startswith('batch.node.ubuntu')):
2664+
node_agent.lower().startswith('batch.node.ubuntu')):
26592665
shipyard_container_required = False
26602666
# check if allowed for gpu (if gpu vm size)
26612667
if allowed:

convoy/settings.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,11 @@
7676
r'^standard_((hb|hc)[\d]+m?rs?(_v[\d])?)$',
7777
re.IGNORECASE
7878
)
79+
_SRIOV_RDMA_TRANSITION_INSTANCES = re.compile(
80+
# standard nc+rs_v3: NCv3 RDMA sizes transitioning from Network Direct to SR-IOV
81+
r'^standard_(nc[\d]+rs_v3)$',
82+
re.IGNORECASE
83+
)
7984
_NETWORKDIRECT_RDMA_INSTANCES = re.compile(
8085
# standard a8/a9, h+r, nc+r, nd+r
8186
r'^standard_((a8|a9)|((h|nc|nd)[\d]+m?rs?(_v[1-3])?))(_promo)?$',
@@ -879,7 +884,10 @@ def is_sriov_rdma_pool(vm_size):
879884
:rtype: bool
880885
:return: if sriov rdma is present
881886
"""
882-
return _SRIOV_RDMA_INSTANCES.match(vm_size) is not None
887+
return (
888+
_SRIOV_RDMA_INSTANCES.match(vm_size) is not None or
889+
_SRIOV_RDMA_TRANSITION_INSTANCES.match(vm_size) is not None
890+
)
883891

884892

885893
def is_networkdirect_rdma_pool(vm_size):
@@ -889,7 +897,10 @@ def is_networkdirect_rdma_pool(vm_size):
889897
:rtype: bool
890898
:return: if network direct rdma is present
891899
"""
892-
return _NETWORKDIRECT_RDMA_INSTANCES.match(vm_size) is not None
900+
return (
901+
_NETWORKDIRECT_RDMA_INSTANCES.match(vm_size) is not None and
902+
_SRIOV_RDMA_TRANSITION_INSTANCES.match(vm_size) is None
903+
)
893904

894905

895906
def is_rdma_pool(vm_size):

scripts/shipyard_nodeprep.sh

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,15 @@ DOCKER_CE_PACKAGE_SLES="docker-${DOCKER_CE_VERSION_SLES}_ce-257.3"
2020
VMBUS_DEVICES_PATH=/sys/bus/vmbus/devices
2121
GEN1_VMBUS_DEVICE_PREFIX="{00000000-0001-"
2222
GEN2_VMBUS_DEVICE_ID="{f8b3781a-1e82-4818-a1c3-63d806ec15bb}"
23+
IB_SM_DEVICE=/dev/infiniband/issm0
24+
IB_UVERBS_DEVICE=/dev/infiniband/uverbs0
2325
MOUNTS_PATH=$AZ_BATCH_NODE_ROOT_DIR/mounts
2426
VOLATILE_PATH=$AZ_BATCH_NODE_ROOT_DIR/volatile
2527
IB_PKEY_FILE=$AZ_BATCH_TASK_WORKING_DIR/IB_PKEY
2628
UCX_IB_PKEY_FILE=$AZ_BATCH_TASK_WORKING_DIR/UCX_IB_PKEY
27-
SHIPYARD_IMAGE_PREFIX=mcr.microsoft.com/azure-batch/shipyard
29+
MCR_REPO=mcr.microsoft.com
30+
BLOBXFER_IMAGE_PREFIX=${MCR_REPO}/blobxfer
31+
SHIPYARD_IMAGE_PREFIX=${MCR_REPO}/azure-batch/shipyard
2832

2933
# status file consts
3034
lisinstalled=${VOLATILE_PATH}/.batch_shipyard_lis_installed
@@ -382,10 +386,38 @@ get_vm_size_from_imds() {
382386
if [[ "$vm_size" =~ ^standard_((hb|hc)[0-9]+m?rs?(_v[1-9])?)$ ]]; then
383387
# SR-IOV RDMA
384388
vm_rdma_type=1
389+
elif [[ "$vm_size" =~ ^standard_(nc[0-9]+rs_v3)$ ]]; then
390+
# SR-IOV RDMA (transition)
391+
vm_rdma_type=1
385392
elif [[ "$vm_size" =~ ^standard_((a8|a9)|((h|nc|nd)[0-9]+m?rs?(_v[1-3])?))$ ]]; then
386393
# network direct RDMA
387394
vm_rdma_type=2
388395
fi
396+
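# override the size-derived RDMA type using the IB devices actually present: $IB_SM_DEVICE => SR-IOV (1), otherwise $IB_UVERBS_DEVICE => Network Direct (2)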
397+
if [ -e "$IB_SM_DEVICE" ]; then
398+
if [ $vm_rdma_type -ne 1 ]; then
399+
log INFO "Changing RDMA type from $vm_rdma_type to SR-IOV for VM size $vm_size as $IB_SM_DEVICE was found."
400+
fi
401+
vm_rdma_type=1
402+
elif [ -e "$IB_UVERBS_DEVICE" ]; then
403+
if [ $vm_rdma_type -ne 2 ]; then
404+
log INFO "Changing RDMA type from $vm_rdma_type to Network Direct for VM size $vm_size as $IB_UVERBS_DEVICE was found."
405+
fi
406+
vm_rdma_type=2
407+
fi
408+
# validate that the IB devices expected for the selected RDMA type exist; abort node prep otherwise
409+
if [ $vm_rdma_type -eq 1 ]; then
410+
if [ ! -e "$IB_SM_DEVICE" ]; then
411+
log ERROR "Expected IB device not found for VM size $vm_size: $IB_SM_DEVICE"
412+
exit 1
413+
fi
414+
fi
415+
if [ $vm_rdma_type -ne 0 ]; then
416+
if [ ! -e "$IB_UVERBS_DEVICE" ]; then
417+
log ERROR "Expected IB device not found for VM size $vm_size: $IB_UVERBS_DEVICE"
418+
exit 1
419+
fi
420+
fi
389421
log INFO "VmSize=$vm_size RDMA=$vm_rdma_type"
390422
}
391423

@@ -712,12 +744,10 @@ install_kernel_devel_package() {
712744
set -e
713745
local centos_ver
714746
centos_ver=$(cut -d' ' -f 4 /etc/centos-release)
715-
if [ -e /dev/infiniband/uverbs0 ]; then
716-
if [ "$vm_rdma_type" -ne 0 ]; then
717-
# HPC distros have pinned repos
718-
install_packages "${kernel_devel_package}"
719-
installed=1
720-
fi
747+
if [ "$vm_rdma_type" -ne 0 ]; then
748+
# HPC distros have pinned repos
749+
install_packages "${kernel_devel_package}"
750+
installed=1
721751
fi
722752
if [ "$installed" -eq 0 ]; then
723753
if [[ "$centos_ver" == 7.3.* ]] || [[ "$centos_ver" == 7.4.* ]] || [[ "$centos_ver" == 7.5.* ]] || [[ "$centos_ver" == 7.6.* ]]; then
@@ -1526,8 +1556,7 @@ install_intel_mpi() {
15261556
log DEBUG "Not installing Intel MPI due to custom image"
15271557
return
15281558
fi
1529-
if [ -e /dev/infiniband/uverbs0 ]; then
1530-
log INFO "IB device found"
1559+
if [ "$vm_rdma_type" -eq 2 ]; then
15311560
if [ ! -d /opt/intel/compilers_and_libraries/linux/mpi ]; then
15321561
log DEBUG "Installing Intel MPI"
15331562
if [[ "$DISTRIB_ID" == sles* ]]; then
@@ -1562,7 +1591,7 @@ install_intel_mpi() {
15621591
exit 1
15631592
fi
15641593
else
1565-
log INFO "IB device not found"
1594+
log INFO "Not installing Intel MPI for RDMA type: $vm_rdma_type"
15661595
fi
15671596
}
15681597

@@ -1602,14 +1631,6 @@ check_for_mellanox_card() {
16021631
if [ "$vm_rdma_type" -eq 1 ]; then
16031632
log ERROR "Expected Mellanox IB card not detected"
16041633
exit 1
1605-
elif [ "$vm_rdma_type" -eq 2 ]; then
1606-
# check for ib device
1607-
if [ -e /dev/infiniband/uverbs0 ]; then
1608-
log INFO "IB device detected"
1609-
else
1610-
log ERROR "Expected IB device not detected"
1611-
exit 1
1612-
fi
16131634
fi
16141635
fi
16151636
if [ "$vm_rdma_type" -eq 1 ]; then
@@ -1733,7 +1754,7 @@ install_and_start_node_exporter() {
17331754
local ib
17341755
local nfs
17351756
nfs="--no-collector.nfs"
1736-
if [ -e /dev/infiniband/uverbs0 ]; then
1757+
if [ $vm_rdma_type -eq 1 ]; then
17371758
ib="--collector.infiniband"
17381759
else
17391760
ib="--no-collector.infiniband"
@@ -2001,7 +2022,7 @@ if [ $native_mode -eq 0 ] || [ $delay_preload -eq 1 ]; then
20012022
fi
20022023

20032024
# retrieve required docker images
2004-
docker_pull_image "mcr.microsoft.com/blobxfer:${blobxferversion}"
2025+
docker_pull_image "${BLOBXFER_IMAGE_PREFIX}:${blobxferversion}"
20052026
docker_pull_image "${SHIPYARD_IMAGE_PREFIX}:${shipyardversion}-cargo"
20062027

20072028
# install container runtimes

0 commit comments

Comments
 (0)