Skip to content

Commit fea5ef0

Browse files
Alexander Eldeib (alexeldeib)
authored and committed
refactor: relax gpu validation timeout to 5 minutes
NVIDIA GPUs take 1-3 seconds per card to load/initialize; for VM sizes with many GPUs, this can take more than 25 seconds (our old timeout). We raised it to 60 seconds, but there is no real reason for that particular value. We raise it arbitrarily to 5 minutes here to avoid any tail latency issues while we work toward a more stable/performant fix. Additionally, there is little reason for this to fail provisioning; sophisticated customers can fix it themselves post-creation.
1 parent aff5df9 commit fea5ef0

File tree

106 files changed

+214
-214
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

106 files changed

+214
-214
lines changed

parts/linux/cloud-init/artifacts/cse_config.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ configGPUDrivers() {
557557
if [[ $OS == $UBUNTU_OS_NAME ]]; then
558558
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
559559
fi
560-
retrycmd_if_failure 120 5 60 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
560+
retrycmd_if_failure 120 5 300 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
561561
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
562562

563563
# reload containerd/dockerd
@@ -577,9 +577,9 @@ validateGPUDrivers() {
577577
retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
578578
which nvidia-smi
579579
if [[ $? == 0 ]]; then
580-
SMI_RESULT=$(retrycmd_if_failure 24 5 60 nvidia-smi)
580+
SMI_RESULT=$(retrycmd_if_failure 24 5 300 nvidia-smi)
581581
else
582-
SMI_RESULT=$(retrycmd_if_failure 24 5 60 $GPU_DEST/bin/nvidia-smi)
582+
SMI_RESULT=$(retrycmd_if_failure 24 5 300 $GPU_DEST/bin/nvidia-smi)
583583
fi
584584
SMI_STATUS=$?
585585
if [[ $SMI_STATUS != 0 ]]; then

pkg/agent/testdata/AKSUbuntu1604+CustomKubeletConfig+CustomLinuxOSConfig/CustomData

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu1604+CustomKubeletConfig+CustomLinuxOSConfig/line70.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ configGPUDrivers() {
557557
if [[ $OS == $UBUNTU_OS_NAME ]]; then
558558
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
559559
fi
560-
retrycmd_if_failure 120 5 60 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
560+
retrycmd_if_failure 120 5 300 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
561561
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
562562

563563
# reload containerd/dockerd
@@ -577,9 +577,9 @@ validateGPUDrivers() {
577577
retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
578578
which nvidia-smi
579579
if [[ $? == 0 ]]; then
580-
SMI_RESULT=$(retrycmd_if_failure 24 5 60 nvidia-smi)
580+
SMI_RESULT=$(retrycmd_if_failure 24 5 300 nvidia-smi)
581581
else
582-
SMI_RESULT=$(retrycmd_if_failure 24 5 60 $GPU_DEST/bin/nvidia-smi)
582+
SMI_RESULT=$(retrycmd_if_failure 24 5 300 $GPU_DEST/bin/nvidia-smi)
583583
fi
584584
SMI_STATUS=$?
585585
if [[ $SMI_STATUS != 0 ]]; then

pkg/agent/testdata/AKSUbuntu1604+CustomKubeletConfig+DynamicKubeletConfig/CustomData

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu1604+CustomKubeletConfig+DynamicKubeletConfig/line70.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ configGPUDrivers() {
557557
if [[ $OS == $UBUNTU_OS_NAME ]]; then
558558
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
559559
fi
560-
retrycmd_if_failure 120 5 60 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
560+
retrycmd_if_failure 120 5 300 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
561561
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
562562

563563
# reload containerd/dockerd
@@ -577,9 +577,9 @@ validateGPUDrivers() {
577577
retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
578578
which nvidia-smi
579579
if [[ $? == 0 ]]; then
580-
SMI_RESULT=$(retrycmd_if_failure 24 5 60 nvidia-smi)
580+
SMI_RESULT=$(retrycmd_if_failure 24 5 300 nvidia-smi)
581581
else
582-
SMI_RESULT=$(retrycmd_if_failure 24 5 60 $GPU_DEST/bin/nvidia-smi)
582+
SMI_RESULT=$(retrycmd_if_failure 24 5 300 $GPU_DEST/bin/nvidia-smi)
583583
fi
584584
SMI_STATUS=$?
585585
if [[ $SMI_STATUS != 0 ]]; then

pkg/agent/testdata/AKSUbuntu1604+Disable1804SystemdResolved=false/CustomData

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu1604+Disable1804SystemdResolved=false/line70.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ configGPUDrivers() {
557557
if [[ $OS == $UBUNTU_OS_NAME ]]; then
558558
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
559559
fi
560-
retrycmd_if_failure 120 5 60 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
560+
retrycmd_if_failure 120 5 300 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
561561
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
562562

563563
# reload containerd/dockerd
@@ -577,9 +577,9 @@ validateGPUDrivers() {
577577
retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
578578
which nvidia-smi
579579
if [[ $? == 0 ]]; then
580-
SMI_RESULT=$(retrycmd_if_failure 24 5 60 nvidia-smi)
580+
SMI_RESULT=$(retrycmd_if_failure 24 5 300 nvidia-smi)
581581
else
582-
SMI_RESULT=$(retrycmd_if_failure 24 5 60 $GPU_DEST/bin/nvidia-smi)
582+
SMI_RESULT=$(retrycmd_if_failure 24 5 300 $GPU_DEST/bin/nvidia-smi)
583583
fi
584584
SMI_STATUS=$?
585585
if [[ $SMI_STATUS != 0 ]]; then

pkg/agent/testdata/AKSUbuntu1604+Disable1804SystemdResolved=true/CustomData

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu1604+Disable1804SystemdResolved=true/line70.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ configGPUDrivers() {
557557
if [[ $OS == $UBUNTU_OS_NAME ]]; then
558558
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
559559
fi
560-
retrycmd_if_failure 120 5 60 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
560+
retrycmd_if_failure 120 5 300 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
561561
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
562562

563563
# reload containerd/dockerd
@@ -577,9 +577,9 @@ validateGPUDrivers() {
577577
retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
578578
which nvidia-smi
579579
if [[ $? == 0 ]]; then
580-
SMI_RESULT=$(retrycmd_if_failure 24 5 60 nvidia-smi)
580+
SMI_RESULT=$(retrycmd_if_failure 24 5 300 nvidia-smi)
581581
else
582-
SMI_RESULT=$(retrycmd_if_failure 24 5 60 $GPU_DEST/bin/nvidia-smi)
582+
SMI_RESULT=$(retrycmd_if_failure 24 5 300 $GPU_DEST/bin/nvidia-smi)
583583
fi
584584
SMI_STATUS=$?
585585
if [[ $SMI_STATUS != 0 ]]; then

pkg/agent/testdata/AKSUbuntu1604+DynamicKubeletConfig/CustomData

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)