Skip to content

Commit aff5df9

Browse files
ganeshkumarashok authored and alexeldeib committed
Increase nvidia-smi timeout (#2901)
1 parent c301f93 commit aff5df9

File tree

106 files changed

+214
-214
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

106 files changed

+214
-214
lines changed

parts/linux/cloud-init/artifacts/cse_config.sh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -557,7 +557,7 @@ configGPUDrivers() {
557557
if [[ $OS == $UBUNTU_OS_NAME ]]; then
558558
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
559559
fi
560-
retrycmd_if_failure 120 5 25 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
560+
retrycmd_if_failure 120 5 60 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
561561
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
562562

563563
# reload containerd/dockerd
@@ -577,9 +577,9 @@ validateGPUDrivers() {
577577
retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
578578
which nvidia-smi
579579
if [[ $? == 0 ]]; then
580-
SMI_RESULT=$(retrycmd_if_failure 24 5 25 nvidia-smi)
580+
SMI_RESULT=$(retrycmd_if_failure 24 5 60 nvidia-smi)
581581
else
582-
SMI_RESULT=$(retrycmd_if_failure 24 5 25 $GPU_DEST/bin/nvidia-smi)
582+
SMI_RESULT=$(retrycmd_if_failure 24 5 60 $GPU_DEST/bin/nvidia-smi)
583583
fi
584584
SMI_STATUS=$?
585585
if [[ $SMI_STATUS != 0 ]]; then

pkg/agent/testdata/AKSUbuntu1604+CustomKubeletConfig+CustomLinuxOSConfig/CustomData

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu1604+CustomKubeletConfig+CustomLinuxOSConfig/line70.sh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -557,7 +557,7 @@ configGPUDrivers() {
557557
if [[ $OS == $UBUNTU_OS_NAME ]]; then
558558
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
559559
fi
560-
retrycmd_if_failure 120 5 25 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
560+
retrycmd_if_failure 120 5 60 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
561561
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
562562

563563
# reload containerd/dockerd
@@ -577,9 +577,9 @@ validateGPUDrivers() {
577577
retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
578578
which nvidia-smi
579579
if [[ $? == 0 ]]; then
580-
SMI_RESULT=$(retrycmd_if_failure 24 5 25 nvidia-smi)
580+
SMI_RESULT=$(retrycmd_if_failure 24 5 60 nvidia-smi)
581581
else
582-
SMI_RESULT=$(retrycmd_if_failure 24 5 25 $GPU_DEST/bin/nvidia-smi)
582+
SMI_RESULT=$(retrycmd_if_failure 24 5 60 $GPU_DEST/bin/nvidia-smi)
583583
fi
584584
SMI_STATUS=$?
585585
if [[ $SMI_STATUS != 0 ]]; then

pkg/agent/testdata/AKSUbuntu1604+CustomKubeletConfig+DynamicKubeletConfig/CustomData

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu1604+CustomKubeletConfig+DynamicKubeletConfig/line70.sh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -557,7 +557,7 @@ configGPUDrivers() {
557557
if [[ $OS == $UBUNTU_OS_NAME ]]; then
558558
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
559559
fi
560-
retrycmd_if_failure 120 5 25 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
560+
retrycmd_if_failure 120 5 60 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
561561
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
562562

563563
# reload containerd/dockerd
@@ -577,9 +577,9 @@ validateGPUDrivers() {
577577
retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
578578
which nvidia-smi
579579
if [[ $? == 0 ]]; then
580-
SMI_RESULT=$(retrycmd_if_failure 24 5 25 nvidia-smi)
580+
SMI_RESULT=$(retrycmd_if_failure 24 5 60 nvidia-smi)
581581
else
582-
SMI_RESULT=$(retrycmd_if_failure 24 5 25 $GPU_DEST/bin/nvidia-smi)
582+
SMI_RESULT=$(retrycmd_if_failure 24 5 60 $GPU_DEST/bin/nvidia-smi)
583583
fi
584584
SMI_STATUS=$?
585585
if [[ $SMI_STATUS != 0 ]]; then

pkg/agent/testdata/AKSUbuntu1604+Disable1804SystemdResolved=false/CustomData

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu1604+Disable1804SystemdResolved=false/line70.sh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -557,7 +557,7 @@ configGPUDrivers() {
557557
if [[ $OS == $UBUNTU_OS_NAME ]]; then
558558
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
559559
fi
560-
retrycmd_if_failure 120 5 25 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
560+
retrycmd_if_failure 120 5 60 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
561561
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
562562

563563
# reload containerd/dockerd
@@ -577,9 +577,9 @@ validateGPUDrivers() {
577577
retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
578578
which nvidia-smi
579579
if [[ $? == 0 ]]; then
580-
SMI_RESULT=$(retrycmd_if_failure 24 5 25 nvidia-smi)
580+
SMI_RESULT=$(retrycmd_if_failure 24 5 60 nvidia-smi)
581581
else
582-
SMI_RESULT=$(retrycmd_if_failure 24 5 25 $GPU_DEST/bin/nvidia-smi)
582+
SMI_RESULT=$(retrycmd_if_failure 24 5 60 $GPU_DEST/bin/nvidia-smi)
583583
fi
584584
SMI_STATUS=$?
585585
if [[ $SMI_STATUS != 0 ]]; then

pkg/agent/testdata/AKSUbuntu1604+Disable1804SystemdResolved=true/CustomData

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

pkg/agent/testdata/AKSUbuntu1604+Disable1804SystemdResolved=true/line70.sh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -557,7 +557,7 @@ configGPUDrivers() {
557557
if [[ $OS == $UBUNTU_OS_NAME ]]; then
558558
retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL
559559
fi
560-
retrycmd_if_failure 120 5 25 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
560+
retrycmd_if_failure 120 5 60 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL
561561
retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL
562562

563563
# reload containerd/dockerd
@@ -577,9 +577,9 @@ validateGPUDrivers() {
577577
retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL
578578
which nvidia-smi
579579
if [[ $? == 0 ]]; then
580-
SMI_RESULT=$(retrycmd_if_failure 24 5 25 nvidia-smi)
580+
SMI_RESULT=$(retrycmd_if_failure 24 5 60 nvidia-smi)
581581
else
582-
SMI_RESULT=$(retrycmd_if_failure 24 5 25 $GPU_DEST/bin/nvidia-smi)
582+
SMI_RESULT=$(retrycmd_if_failure 24 5 60 $GPU_DEST/bin/nvidia-smi)
583583
fi
584584
SMI_STATUS=$?
585585
if [[ $SMI_STATUS != 0 ]]; then

pkg/agent/testdata/AKSUbuntu1604+DynamicKubeletConfig/CustomData

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)