Skip to content

Commit 3b05e16

Browse files
authored
Merge pull request #1864 from ROCm/1846-jenkins-add-a-timeout-around-gpu-reset-functionality
Added 5min timeout to the resetGPUs() func
2 parents 820fc94 + fed0600 commit 3b05e16

File tree

1 file changed

+26
-24
lines changed

1 file changed

+26
-24
lines changed

mlir/utils/jenkins/Jenkinsfile

Lines changed: 26 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -66,35 +66,37 @@ def advancedNodeCheck(Map params) {
6666
}
6767

6868
def resetGPUs() {
69-
// Run the reset, but don't fail the build if anything is wrong
70-
def rc = sh(
71-
script: '''
72-
reset_all_gpus() {
73-
echo "Scanning GPUs..."
74-
GPU_IDS=$(rocm-smi | awk '/^[0-9]+[[:space:]]+[0-9]+[[:space:]]+0x/ { print $1 }')
75-
if [ -z "$GPU_IDS" ]; then
76-
echo "WARNING: No GPUs found to reset."
77-
return 0
78-
fi
79-
for id in $GPU_IDS; do
80-
echo "Resetting GPU ID: $id"
81-
if ! rocm-smi --gpureset -d $id; then
82-
echo "WARNING: Unable to reset GPU $id"
69+
// Abort this if it runs longer than 5 minutes
70+
timeout(time: 5, unit: 'MINUTES') {
71+
// Run the reset, but don't fail the build if anything is wrong
72+
def rc = sh(
73+
script: '''
74+
reset_all_gpus() {
75+
echo "Scanning GPUs..."
76+
GPU_IDS=$(rocm-smi | awk '/^[0-9]+[[:space:]]+[0-9]+[[:space:]]+0x/ { print $1 }')
77+
if [ -z "$GPU_IDS" ]; then
78+
echo "WARNING: No GPUs found to reset."
79+
return 0
8380
fi
84-
sleep 2
85-
done
86-
return 0
81+
for id in $GPU_IDS; do
82+
echo "Resetting GPU ID: $id"
83+
if ! rocm-smi --gpureset -d $id; then
84+
echo "WARNING: Unable to reset GPU $id"
85+
fi
86+
sleep 2
87+
done
88+
return 0
89+
}
90+
reset_all_gpus
91+
''',
92+
returnStatus: true
93+
)
94+
if (rc != 0) {
95+
echo "WARNING: reset_all_gpus exited with code ${rc}, but continuing anyway"
8796
}
88-
reset_all_gpus
89-
''',
90-
returnStatus: true
91-
)
92-
if (rc != 0) {
93-
echo "WARNING: reset_all_gpus exited with code ${rc}, but continuing anywat=y"
9497
}
9598
}
9699

97-
98100
def checkNodeHealth(Map opts = [:]) {
99101
advancedNodeCheck(
100102
doCleanWs: opts.get('doCleanWs', true),

0 commit comments

Comments
 (0)