@@ -66,35 +66,37 @@ def advancedNodeCheck(Map params) {
6666}
6767
/**
 * Best-effort reset of every GPU reported by `rocm-smi` on the current node.
 *
 * The shell script lists the GPU ids from the `rocm-smi` table, then calls
 * `rocm-smi --gpureset -d <id>` for each one, pausing 2 seconds between
 * resets. A failed reset of a single GPU, or a non-zero exit code from the
 * script as a whole, is only logged as a warning; it does not fail the build.
 *
 * The whole operation is wrapped in a 5 minute `timeout` step.
 * NOTE(review): exceeding the timeout interrupts the enclosed steps and is
 * NOT swallowed here, unlike a non-zero exit code -- confirm this is the
 * intended behaviour for callers.
 */
def resetGPUs() {
    // Abort the reset if it runs longer than 5 minutes
    timeout(time: 5, unit: 'MINUTES') {
        // Run the reset, but don't fail the build if anything is wrong:
        // returnStatus makes `sh` hand back the exit code instead of throwing.
        def rc = sh(
            script: '''
                reset_all_gpus() {
                    echo "Scanning GPUs..."
                    # Keep the first column (GPU id) of rows shaped like "<id> <n> 0x..."
                    GPU_IDS=$(rocm-smi | awk '/^[0-9]+[[:space:]]+[0-9]+[[:space:]]+0x/ { print $1 }')
                    if [ -z "$GPU_IDS" ]; then
                        echo "WARNING: No GPUs found to reset."
                        return 0
                    fi
                    for id in $GPU_IDS; do
                        echo "Resetting GPU ID: $id"
                        if ! rocm-smi --gpureset -d "$id"; then
                            echo "WARNING: Unable to reset GPU $id"
                        fi
                        sleep 2
                    done
                    return 0
                }
                reset_all_gpus
            ''',
            returnStatus: true
        )
        if (rc != 0) {
            echo "WARNING: reset_all_gpus exited with code ${rc}, but continuing anyway"
        }
    }
}
9699
97-
98100def checkNodeHealth (Map opts = [:]) {
99101 advancedNodeCheck(
100102 doCleanWs : opts. get(' doCleanWs' , true ),
0 commit comments