@@ -20,11 +20,15 @@ DOCKER_CE_PACKAGE_SLES="docker-${DOCKER_CE_VERSION_SLES}_ce-257.3"
2020VMBUS_DEVICES_PATH=/sys/bus/vmbus/devices
2121GEN1_VMBUS_DEVICE_PREFIX=" {00000000-0001-"
2222GEN2_VMBUS_DEVICE_ID=" {f8b3781a-1e82-4818-a1c3-63d806ec15bb}"
23+ IB_SM_DEVICE=/dev/infiniband/issm0
24+ IB_UVERBS_DEVICE=/dev/infiniband/uverbs0
2325MOUNTS_PATH=$AZ_BATCH_NODE_ROOT_DIR /mounts
2426VOLATILE_PATH=$AZ_BATCH_NODE_ROOT_DIR /volatile
2527IB_PKEY_FILE=$AZ_BATCH_TASK_WORKING_DIR /IB_PKEY
2628UCX_IB_PKEY_FILE=$AZ_BATCH_TASK_WORKING_DIR /UCX_IB_PKEY
27- SHIPYARD_IMAGE_PREFIX=mcr.microsoft.com/azure-batch/shipyard
29+ MCR_REPO=mcr.microsoft.com
30+ BLOBXFER_IMAGE_PREFIX=${MCR_REPO} /blobxfer
31+ SHIPYARD_IMAGE_PREFIX=${MCR_REPO} /azure-batch/shipyard
2832
2933# status file consts
3034lisinstalled=${VOLATILE_PATH} /.batch_shipyard_lis_installed
@@ -382,10 +386,38 @@ get_vm_size_from_imds() {
382386 if [[ " $vm_size " =~ ^standard_(( hb| hc)[0 - 9 ]+ m? rs? (_v[1 - 9 ])? )$ ]]; then
383387 # SR-IOV RDMA
384388 vm_rdma_type=1
389+ elif [[ "$vm_size " =~ ^standard_(nc[0 -9 ]+rs_v3 )$ ]]; then
390+ # SR-IOV RDMA (transition)
391+ vm_rdma_type=1
385392 elif [[ "$vm_size " =~ ^standard_((a8 |a9 )|((h|nc|nd)[0 -9 ]+m?rs?(_v[1 -3 ])?)) $ ]]; then
386393 # network direct RDMA
387394 vm_rdma_type=2
388395 fi
396+ # perform device-based overrides
397+ if [ -e " $IB_SM_DEVICE " ]; then
398+ if [ $vm_rdma_type -ne 1 ]; then
399+ log INFO " Changing RDMA type from $vm_rdma_type to SR-IOV for VM size $vm_size as $IB_SM_DEVICE was found."
400+ fi
401+ vm_rdma_type=1
402+ elif [ -e " $IB_UVERBS_DEVICE " ]; then
403+ if [ $vm_rdma_type -ne 2 ]; then
404+ log INFO " Changing RDMA type from $vm_rdma_type to Network Direct for VM size $vm_size as $IB_UVERBS_DEVICE was found."
405+ fi
406+ vm_rdma_type=2
407+ fi
408+ # validate setting
409+ if [ $vm_rdma_type -eq 1 ]; then
410+ if [ ! -e " $IB_SM_DEVICE " ]; then
411+ log ERROR " Expected IB device not found for VM size $vm_size : $IB_SM_DEVICE "
412+ exit 1
413+ fi
414+ fi
415+ if [ $vm_rdma_type -ne 0 ]; then
416+ if [ ! -e " $IB_UVERBS_DEVICE " ]; then
417+ log ERROR " Expected IB device not found for VM size $vm_size : $IB_UVERBS_DEVICE "
418+ exit 1
419+ fi
420+ fi
389421 log INFO " VmSize=$vm_size RDMA=$vm_rdma_type "
390422}
391423
@@ -712,12 +744,10 @@ install_kernel_devel_package() {
712744 set -e
713745 local centos_ver
714746 centos_ver=$( cut -d' ' -f 4 /etc/centos-release)
715- if [ -e /dev/infiniband/uverbs0 ]; then
716- if [ " $vm_rdma_type " -ne 0 ]; then
717- # HPC distros have pinned repos
718- install_packages " ${kernel_devel_package} "
719- installed=1
720- fi
747+ if [ " $vm_rdma_type " -ne 0 ]; then
748+ # HPC distros have pinned repos
749+ install_packages " ${kernel_devel_package} "
750+ installed=1
721751 fi
722752 if [ " $installed " -eq 0 ]; then
723753 if [[ " $centos_ver " == 7.3.* ]] || [[ " $centos_ver " == 7.4.* ]] || [[ " $centos_ver " == 7.5.* ]] || [[ " $centos_ver " == 7.6.* ]]; then
@@ -1526,8 +1556,7 @@ install_intel_mpi() {
15261556 log DEBUG " Not installing Intel MPI due to custom image"
15271557 return
15281558 fi
1529- if [ -e /dev/infiniband/uverbs0 ]; then
1530- log INFO " IB device found"
1559+ if [ " $vm_rdma_type " -eq 2 ]; then
15311560 if [ ! -d /opt/intel/compilers_and_libraries/linux/mpi ]; then
15321561 log DEBUG " Installing Intel MPI"
15331562 if [[ " $DISTRIB_ID " == sles* ]]; then
@@ -1562,7 +1591,7 @@ install_intel_mpi() {
15621591 exit 1
15631592 fi
15641593 else
1565- log INFO " IB device not found "
1594+ log INFO " Not installing Intel MPI for RDMA type: $vm_rdma_type "
15661595 fi
15671596}
15681597
@@ -1602,14 +1631,6 @@ check_for_mellanox_card() {
16021631 if [ " $vm_rdma_type " -eq 1 ]; then
16031632 log ERROR " Expected Mellanox IB card not detected"
16041633 exit 1
1605- elif [ " $vm_rdma_type " -eq 2 ]; then
1606- # check for ib device
1607- if [ -e /dev/infiniband/uverbs0 ]; then
1608- log INFO " IB device detected"
1609- else
1610- log ERROR " Expected IB device not detected"
1611- exit 1
1612- fi
16131634 fi
16141635 fi
16151636 if [ " $vm_rdma_type " -eq 1 ]; then
@@ -1733,7 +1754,7 @@ install_and_start_node_exporter() {
17331754 local ib
17341755 local nfs
17351756 nfs=" --no-collector.nfs"
1736- if [ -e /dev/infiniband/uverbs0 ]; then
1757+ if [ $vm_rdma_type -eq 1 ]; then
17371758 ib=" --collector.infiniband"
17381759 else
17391760 ib=" --no-collector.infiniband"
@@ -2001,7 +2022,7 @@ if [ $native_mode -eq 0 ] || [ $delay_preload -eq 1 ]; then
20012022fi
20022023
20032024# retrieve required docker images
2004- docker_pull_image " mcr.microsoft.com/blobxfer :${blobxferversion} "
2025+ docker_pull_image " ${BLOBXFER_IMAGE_PREFIX} :${blobxferversion} "
20052026docker_pull_image " ${SHIPYARD_IMAGE_PREFIX} :${shipyardversion} -cargo"
20062027
20072028# install container runtimes
0 commit comments